In [2]:

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBRFClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report

# Load the labelled corpus. The path is relative to the notebook's working
# directory, so the notebook must be run from its own folder.
df = pd.read_excel("../../database/FinalTrainAbleDataset.xlsx", engine='openpyxl')

# NOTE: a bare `df.head(5)` used to sit here. Jupyter only renders the *last*
# expression of a cell, so it never displayed anything; preview the frame in
# its own cell if a look at the data is needed.

# Features and labels shared by every model cell below.
# astype('U') coerces every entry to a unicode string so the vectorizer always
# receives str documents (a missing value would become the literal text 'nan').
# presumably `stop_clean_body` is the stop-word-cleaned article text — TODO confirm
x = df['stop_clean_body'].values.astype('U')
y = df['target'].values


print("\n########## Random Forest Algorithm ###########")
# TF-IDF over word 1- to 3-grams feeding a 100-tree random forest.
# random_state pins the forest's bootstrap/feature sampling so that the report
# below is reproducible on Restart & Run All (the split is already seeded).
RandomPipeLine = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 3))),
    ('RandomCLf', RandomForestClassifier(n_estimators=100, random_state=0)),
])

# 80/20 hold-out split; the fixed seed yields the same partition in every cell
# that repeats this call with the same arguments.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)
RandomPipeLine.fit(X_train, y_train)
y_pred = RandomPipeLine.predict(X_test)
# Overall accuracy is kept in `score`; the per-class report is what gets printed.
score = metrics.accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))




########## Random Forest Algorithm ###########
               precision    recall  f1-score   support

         asia       0.97      0.88      0.92       107
   bangladesh       0.47      0.32      0.38       107
     business       0.80      0.86      0.83       107
       column       0.99      0.99      0.99       118
      cricket       0.83      0.93      0.87       112
        crime       0.54      0.82      0.65        90
     district       1.00      0.94      0.97        87
  durporobash       1.00      0.99      0.99        90
    education       0.94      0.96      0.95       107
entertainment       0.92      0.92      0.92       103
     football       0.88      0.92      0.90        95
        india       0.97      0.97      0.97       121
         life       0.92      0.73      0.81        96
     politics       0.82      0.94      0.88       107
 science-tech       0.87      0.87      0.87       110
       sports       0.95      0.83      0.88       105
        world   

In [3]:
print('######### Multi ############')

# Multinomial naive Bayes on TF-IDF features built from word 1- to 3-grams.
MultiPipeLine = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 3))),
    ('Mulclf', MultinomialNB()),
])

# Re-create the 80/20 hold-out split; random_state=0 fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
pred = MultiPipeLine.fit(X_train, y_train).predict(X_test)

# Overall accuracy is stored in `score`; only the per-class report is printed.
score = accuracy_score(y_test, pred)
print(classification_report(y_test, pred))


######### Multi ############
               precision    recall  f1-score   support

         asia       0.98      0.93      0.95       107
   bangladesh       0.86      0.06      0.11       107
     business       0.95      0.81      0.87       107
       column       0.38      1.00      0.55       118
      cricket       0.91      0.76      0.83       112
        crime       0.69      0.74      0.72        90
     district       0.71      0.97      0.82        87
  durporobash       0.92      1.00      0.96        90
    education       0.94      0.91      0.92       107
entertainment       0.96      0.89      0.92       103
     football       0.94      0.89      0.92        95
        india       0.99      0.95      0.97       121
         life       0.95      0.56      0.71        96
     politics       0.85      0.92      0.88       107
 science-tech       0.96      0.81      0.88       110
       sports       1.00      0.76      0.86       105
        world       0.99      0.93 

In [4]:

print('########### SVC ############')

# Support-vector classifier (scikit-learn defaults) on word 1- to 3-gram TF-IDF.
SVCPipeLine = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 3))),
    ('SVC', SVC()),
])

# Re-create the 80/20 hold-out split; random_state=0 fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
pred = SVCPipeLine.fit(X_train, y_train).predict(X_test)

# Overall accuracy is stored in `score`; only the per-class report is printed.
score = accuracy_score(y_test, pred)
print(classification_report(y_test, pred))


########### SVC ############
               precision    recall  f1-score   support

         asia       0.96      0.90      0.93       107
   bangladesh       0.46      0.55      0.50       107
     business       0.92      0.87      0.89       107
       column       0.98      0.98      0.98       118
      cricket       0.91      0.93      0.92       112
        crime       0.60      0.83      0.70        90
     district       1.00      0.94      0.97        87
  durporobash       1.00      1.00      1.00        90
    education       0.96      0.94      0.95       107
entertainment       0.97      0.91      0.94       103
     football       0.90      0.96      0.93        95
        india       1.00      0.93      0.97       121
         life       0.88      0.82      0.85        96
     politics       0.91      0.94      0.93       107
 science-tech       0.92      0.90      0.91       110
       sports       1.00      0.81      0.89       105
        world       1.00      0.93 

In [5]:
print('########### KNN ############')

# k-nearest-neighbours (scikit-learn defaults) on word 1- to 3-gram TF-IDF.
KNNPipeLine = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 3))),
    ('KNN', KNeighborsClassifier()),
])

# Re-create the 80/20 hold-out split; random_state=0 fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
pred = KNNPipeLine.fit(X_train, y_train).predict(X_test)

# Overall accuracy is stored in `score`; only the per-class report is printed.
score = accuracy_score(y_test, pred)
print(classification_report(y_test, pred))

########### KNN ############
               precision    recall  f1-score   support

         asia       0.82      0.78      0.80       107
   bangladesh       0.46      0.26      0.33       107
     business       0.72      0.78      0.74       107
       column       0.58      0.95      0.72       118
      cricket       0.79      0.79      0.79       112
        crime       0.67      0.38      0.48        90
     district       0.46      0.82      0.59        87
  durporobash       0.75      0.89      0.81        90
    education       0.86      0.91      0.88       107
entertainment       0.86      0.86      0.86       103
     football       0.84      0.73      0.78        95
        india       0.87      0.86      0.87       121
         life       0.64      0.54      0.59        96
     politics       0.85      0.79      0.82       107
 science-tech       0.80      0.74      0.77       110
       sports       0.79      0.50      0.61       105
        world       0.83      0.85 

In [6]:

print('########### SGD ############')
# Linear model trained with stochastic gradient descent on TF-IDF features
# built from word 1- to 3-grams.
# SGDClassifier reshuffles the training data every epoch; random_state pins
# that shuffle so the report below is reproducible on Restart & Run All.
SGDPipeLine = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 3))),
    ('SGDclf', SGDClassifier(max_iter=1000, tol=1e-3, n_jobs=2, penalty="l2",
                             random_state=0)),
])

# 80/20 hold-out split; the fixed seed yields the same partition on every run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)
SGDPipeLine.fit(X_train, y_train)
pred = SGDPipeLine.predict(X_test)
# Overall accuracy is kept in `score`; the per-class report is what gets printed.
score = metrics.accuracy_score(y_test, pred)
print(classification_report(y_test, pred))

########### SGD ############
               precision    recall  f1-score   support

         asia       0.94      0.96      0.95       107
   bangladesh       0.76      0.39      0.52       107
     business       0.89      0.95      0.92       107
       column       0.89      0.98      0.93       118
      cricket       0.93      0.99      0.96       112
        crime       0.63      0.81      0.71        90
     district       0.97      0.95      0.96        87
  durporobash       0.95      1.00      0.97        90
    education       0.94      0.97      0.95       107
entertainment       0.93      0.94      0.94       103
     football       0.89      0.98      0.93        95
        india       0.97      0.97      0.97       121
         life       0.90      0.76      0.82        96
     politics       0.88      0.97      0.92       107
 science-tech       0.89      0.93      0.91       110
       sports       1.00      0.84      0.91       105
        world       0.99      0.94 