In [36]:
import pandas as pd
import numpy as np
import string
import pickle

#sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

In [3]:
dataset = pd.read_csv('data.csv')
dataset = dataset.dropna()
dataset

Unnamed: 0,Text,category
0,modi promised minimum government maximum gover...,-1.0
1,talk nonsense continue drama vote modi,0.0
2,say vote modi welcome bjp told rahul main camp...,1.0
3,asking supporters prefix chowkidar names modi ...,1.0
4,answer among powerful world leader today trump...,1.0
...,...,...
162975,crores paid neerav modi recovered congress lea...,-1.0
162976,dear rss terrorist payal gawar modi killing pl...,-1.0
162977,cover interaction forum left,0.0
162978,big project came india modi dream project happ...,0.0


# Training data

In [5]:
X = dataset['Text']
y = dataset['category']

In [6]:
#tfidfVectorizer
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size = 0.2, random_state=42)

In [8]:
features = vectorizer.get_feature_names_out()
features

array(['aa', 'aaa', 'aaaa', ..., 'zyadamodi', 'zyonist', 'zzz'],
      dtype=object)

# RandomForestClassifier

In [9]:
rfc = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)

In [10]:
rfc_pred = rfc.predict(X_test)

In [11]:
print("Accuracy of model : ", accuracy_score(y_test, rfc_pred))
#Error rate
print("Error rate of the RandomForest", 1 - accuracy_score(y_test, rfc_pred))

Accuracy of model :  0.8071457073575002
Error rate of the RandomForest 0.19285429264249976


# LinearSVC

In [12]:
clf = LinearSVC(random_state=42).fit(X_train, y_train)

In [13]:
y_svm = clf.predict(X_test)

In [14]:
print("Accuracy of model : ", accuracy_score(y_test, y_svm))
#Error rate
print("Error rate of the RandomForest", 1 - accuracy_score(y_test, y_svm))

Accuracy of model :  0.8993523435341785
Error rate of the RandomForest 0.10064765646582152


# DecisionTreeClassifier

In [20]:
dtree = DecisionTreeClassifier(max_depth=10, random_state = 42, 
                              max_features=None, min_samples_leaf = 30).fit(X_train, y_train)

In [18]:
dtree_pred = dtree.predict(X_test)

In [21]:
print("Accuracy of model : ", accuracy_score(y_test, dtree_pred))
#Error rate
print("Error rate of the RandomForest", 1 - accuracy_score(y_test, dtree_pred))

Accuracy of model :  0.5411461370821695
Error rate of the RandomForest 0.4588538629178305


# LogisticRegression

In [22]:
from sklearn.linear_model import LogisticRegression

In [23]:
model = LogisticRegression()

In [24]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [25]:
model.score(X_train, y_train)

0.9157112819804473

In [26]:
y_pred = model.predict(X_test)

In [27]:
print("Accuracy of model : ", accuracy_score(y_test, y_pred))
#Error rate
print("Error rate of the RandomForest", 1 - accuracy_score(y_test, y_pred))

Accuracy of model :  0.8825316921943583
Error rate of the RandomForest 0.11746830780564166


# KMeans

In [33]:
kmeans = KMeans(n_clusters=10, random_state=0).fit(X_train)

In [34]:
y_kmeans = kmeans.predict(X_test)

In [35]:
print("Accuracy of model : ", accuracy_score(y_test, y_kmeans))
#Error rate
print("Error rate of the RandomForest", 1 - accuracy_score(y_test, y_kmeans))

Accuracy of model :  0.03784646551459529
Error rate of the RandomForest 0.9621535344854047


# KNearestNeighbors

In [37]:
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

In [38]:
knn_pred = pd.Series(knn.predict(X_test))
knn_pred.value_counts()

 0.0    23585
 1.0     6495
-1.0     2499
dtype: int64

In [39]:
knn_pred = knn.predict(X_test)

In [41]:
print("Accuracy of model : ", accuracy_score(y_test, knn_pred))
#Error rate
print("Error rate of the RandomForest", 1 - accuracy_score(y_test, knn_pred))

Accuracy of model :  0.521317413057491
Error rate of the RandomForest 0.478682586942509


# Joblib

In [40]:
import joblib

In [43]:
joblib.dump(clf, 'text_mining.joblib')

['text_mining.joblib']

In [44]:
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [45]:
joblib.dump(clf, 'prediction.pkl')

['prediction.pkl']