## Installations

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Data Fetching

In [0]:
df= pd.read_csv('data_scraped6.csv')

## Trying for various Combinations

In [0]:
Combo = df["Title"] + df['Url_address']+df["Comments"] 
df = df.assign(Combo= Combo) 

In [0]:
X= df.iloc [:, [1,2,3,4,5,6,7,8,9,10,11,12,13]].values
Y= df.iloc [:,[0]].values

## Convert to a matrix of token counts

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect= CountVectorizer()
X_counts=count_vect.fit_transform (df['Combo'].values.astype('U'))
print (X_counts.shape)
print (count_vect.get_feature_names()) 
#print (X_counts)

(4985, 50803)


## Applying various Models to find best accuracy

In [6]:
# Compare Algorithms
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
# load dataset
array = df.values
X_t = X_counts
X_t= X_t.toarray()
Y_t = array[:,0]
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('LR', LogisticRegression(max_iter=200000,solver='lbfgs',multi_class='multinomial')))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('Rand_Forest', RandomForestClassifier()))
models.append(('NB', GaussianNB()))
models.append (('SDG', SGDClassifier()))


# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, shuffle = True, random_state=seed)
	cv_results = model_selection.cross_val_score(model, X_t, Y_t, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)


LR: 0.893081 (0.015506)
KNN: 0.585963 (0.021714)
CART: 0.892881 (0.019008)
Rand_Forest: 0.893883 (0.018744)
NB: 0.866004 (0.016421)
SDG: 0.889069 (0.018401)


## Saving the model

In [0]:
import pickle 
filename = 'finalized_model.sav'
pickle.dump(models[0], open(filename, 'wb'))