In [1]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer,CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd

data=pd.read_csv('dataMining.csv')
X_train,X_test,Y_train,Y_test=train_test_split(data['content'],data['class'],test_size=float(1/3),random_state=42)



#training with using gini index for decision tree

train_gini=DecisionTreeClassifier(criterion='gini',max_depth=3)
gini_index=Pipeline([('vect_gini',TfidfVectorizer()),('tdidf_gini',TfidfTransformer()),('clfg',train_gini)])
gini_index.fit(X_train,Y_train)
test_pred_gini=gini_index.predict(X_test)
print("accuracy:")
print(accuracy_score(Y_test,test_pred_gini))
#Confusion Matrix is used to understand the trained classifier behavior over the test dataset or validate dataset.
print("confusion_matrix: ")
print(confusion_matrix(Y_test,test_pred_gini))
print("classification_report:")
print(classification_report(Y_test,test_pred_gini))

accuracy:
0.8253968253968254
confusion_matrix: 
[[10  0  5  0]
 [ 0 11  3  1]
 [ 0  0 14  0]
 [ 0  0  2 17]]
classification_report:
              precision    recall  f1-score   support

     english       1.00      0.67      0.80        15
      french       1.00      0.73      0.85        15
      german       0.58      1.00      0.74        14
     spanish       0.94      0.89      0.92        19

   micro avg       0.83      0.83      0.83        63
   macro avg       0.88      0.82      0.83        63
weighted avg       0.89      0.83      0.83        63



In [17]:
#training with using entropy (information gain) for decision tree

train_entropy=DecisionTreeClassifier(criterion='entropy',max_depth=2)
entropy=Pipeline([('vect_entropy',TfidfVectorizer()),('tdidf_entropy',TfidfTransformer()),('clfe',train_entropy)])
entropy.fit(X_train,Y_train)
test_pred_entropy=entropy.predict(X_test)
print("accuracy:")
print(accuracy_score(Y_test,test_pred_entropy))
print("confusion_matrix: ")
print(confusion_matrix(Y_test,test_pred_entropy))
print("classification_report:")
print(classification_report(Y_test,test_pred_entropy))

accuracy:
0.7301587301587301
confusion_matrix: 
[[10  0  5  0]
 [ 0  5  3  7]
 [ 0  0 14  0]
 [ 0  0  2 17]]
classification_report:
              precision    recall  f1-score   support

     english       1.00      0.67      0.80        15
      french       1.00      0.33      0.50        15
      german       0.58      1.00      0.74        14
     spanish       0.71      0.89      0.79        19

   micro avg       0.73      0.73      0.73        63
   macro avg       0.82      0.72      0.71        63
weighted avg       0.82      0.73      0.71        63



In [18]:
#classification with naive bayes method
"""in this section the gaussian naive bayes not working for text feature extracted becaues of  sparse 
inputs are not implemented in GaussianNB is that very sparse data almost certainly does not meet the 
assumptions of the algorithm â€“ when the bulk of the values are zero, a simple Gaussian is not a good 
fit to the data, and will almost never lead to a useful classification."""




from sklearn.naive_bayes import MultinomialNB as MNB

naive_bayes=Pipeline([('vect_naiveB',TfidfVectorizer()),('tdidf_naiveB',TfidfTransformer()),('clfNB',MNB())])
naive_bayes.fit(X_train,Y_train)
NB_pred=naive_bayes.predict(X_test)
print("accuracy:")
print(accuracy_score(Y_test,NB_pred))
print("confusion_matrix: ")
print(confusion_matrix(Y_test,NB_pred))
print("classification_report:")
print(classification_report(Y_test,NB_pred))

accuracy:
1.0
confusion_matrix: 
[[15  0  0  0]
 [ 0 15  0  0]
 [ 0  0 14  0]
 [ 0  0  0 19]]
classification_report:
              precision    recall  f1-score   support

     english       1.00      1.00      1.00        15
      french       1.00      1.00      1.00        15
      german       1.00      1.00      1.00        14
     spanish       1.00      1.00      1.00        19

   micro avg       1.00      1.00      1.00        63
   macro avg       1.00      1.00      1.00        63
weighted avg       1.00      1.00      1.00        63



In [2]:
#classification with KNearestNeighbors


from sklearn.neighbors import KNeighborsClassifier as KNN

KNN_clf=Pipeline([('vect_KNN',TfidfVectorizer()),('tfidf_KNN',TfidfTransformer()),('clf_KNN',KNN())])
KNN_clf.fit(X_train,Y_train)
KNN_pred=KNN_clf.predict(X_test)
print("accuracy:")
print(accuracy_score(Y_test,KNN_pred))
print("confusion_matrix: ")
print(confusion_matrix(Y_test,KNN_pred))
print("classification_report:")
print(classification_report(Y_test,KNN_pred))

accuracy:
1.0
confusion_matrix: 
[[15  0  0  0]
 [ 0 15  0  0]
 [ 0  0 14  0]
 [ 0  0  0 19]]
classification_report:
              precision    recall  f1-score   support

     english       1.00      1.00      1.00        15
      french       1.00      1.00      1.00        15
      german       1.00      1.00      1.00        14
     spanish       1.00      1.00      1.00        19

   micro avg       1.00      1.00      1.00        63
   macro avg       1.00      1.00      1.00        63
weighted avg       1.00      1.00      1.00        63

