## Buzz Prediction on Twitter (Classification Models)
Objective of this notebook:

- Applied the following classification models to the data:
    - KNN Classifier
    - Linear Support Vector Machine
    - Decision Tree

- Justified a suitable evaluation strategy for this dataset.
- Used Grid Search to find the best value of each model's tuning parameter(s).
- Used cross-validation to find the average training and testing scores.
- Compared the results from the above models and selected the best classifier for this dataset.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings

# Suppress every Python warning for the rest of the notebook.
# NOTE(review): this also hides library deprecation/convergence warnings — confirm that is intended.
warnings.filterwarnings('ignore')

: 

In [None]:
# Load the raw Twitter buzz data.
# The .data file carries no header row (every column is renamed right after loading),
# so read it with header=None; otherwise pandas consumes the first observation as
# column names and one sample is silently dropped.
data = pd.read_csv('./dataset/Twitter-Absolute-Sigma-500.data', header=None)

: 

In [None]:
# Rename columns: each of the 11 feature families has 7 consecutive columns
# (suffix _0 .. _6), followed by the final 'Target' column.
feature_prefixes = ['NCD', 'AI', 'AS(NA)', 'BL', 'NAC', 'AS(NAC)',
                    'CS', 'AT', 'NA', 'ADL', 'NAD']
data.columns = [
    '{}_{}'.format(prefix, step)
    for prefix in feature_prefixes
    for step in range(7)
] + ['Target']

: 

In [None]:
# Preview the first rows to check the renamed columns and their values.
data.head()

: 

In [None]:
# (rows, columns) of the loaded frame.
data.shape

: 

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
data.describe()

: 

In [None]:
# Column dtypes, non-null counts and memory usage.
data.info()

: 

In [None]:
# Features are every column except the last; the last column is the target.
X = data.iloc[:,:-1]
y = data.iloc[:,-1]
print(X.shape)
print(y.shape)

# Keep a 10% subsample of the data for the experiments below
# (presumably to keep the grid searches tractable).
# random_state makes the subsample reproducible across kernel restarts, and
# stratify=y preserves the class ratio of the full data in the subsample.
from  sklearn.model_selection import train_test_split
_, sample_data, _, sample_target = train_test_split(
    X, y, shuffle = True, test_size = 0.1, random_state = 0, stratify = y)

: 

### Explore Dataset

In [None]:
# Fraction of positive targets: the mean of the target column
# (sum of the non-null values divided by their count).
y.mean()

: 

Class Distribution: -- Positive instances (i.e. Buzz): 27775 (≈ 19.7 %) -- Negative instances (i.e. Non Buzz): 112931 (≈ 80.3 %)

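It can be clearly seen that this is an imbalanced dataset, so plain accuracy would be misleading (always predicting "Non Buzz" would already score about 80 %). Here we have to minimize both Type I and Type II errors, so we have used the F1 score as the evaluation metric for this classification task.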

Also, since the features in the dataset are not normally distributed, we have used the MinMax scaler.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# Hold out a test partition from the subsample. The classes are imbalanced, so
# stratify on the target to keep the same positive rate in train and test.
X_train_org, X_test_org, y_train, y_test = train_test_split(
    sample_data, sample_target, random_state = 0, stratify = sample_target)

# Fit the scaler on the training partition only and reuse its min/max on the
# test partition, so nothing from the test set leaks into preprocessing.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_org)
X_test = scaler.transform(X_test_org)

: 

### 1. KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Tune the number of neighbours with 5-fold cross-validation, selecting on F1.
knn = KNeighborsClassifier()
param_grid = {'n_neighbors':[1, 5, 10, 15, 20]}

# return_train_score=True keeps the training-fold scores in cv_results_ so they
# can be compared against the validation-fold scores later.
grid_knn = GridSearchCV(knn, param_grid=param_grid, cv=5, scoring='f1', return_train_score=True)
grid_knn.fit(X_train, y_train)
# score() on the fitted search applies the configured scorer ('f1'), so this is
# the training F1 of the refit best model, not its accuracy.
grid_knn.score(X_train, y_train)

: 

In [None]:
# F1 (the search's scorer) of the best KNN model on the held-out test partition.
grid_knn.score(X_test, y_test)

: 

In [None]:
# Neighbour count selected by the grid search.
grid_knn.best_params_

: 

In [None]:
# Mean cross-validated F1 for each candidate k.
grid_knn.cv_results_['mean_test_score']

: 

In [None]:
# Predictions from the refit best KNN model on the test and training partitions.
y_knn_predict = grid_knn.predict(X_test)
y_knn_train_predict = grid_knn.predict(X_train)

: 

In [None]:
# f1_score's signature is (y_true, y_pred): pass the ground truth first.
print('Train F1 Score: %.4f'%f1_score(y_train, y_knn_train_predict))
print('Test F1 Score: %.4f '%f1_score(y_test, y_knn_predict))

: 

In [None]:
# Tabulate the KNN grid-search results and show, for each candidate k,
# the mean training-fold and validation-fold scores.
cv_results_knn = pd.DataFrame(grid_knn.cv_results_)
cv_results_knn.loc[:, ['param_n_neighbors', 'mean_train_score', 'mean_test_score']]

: 

In [None]:
x_axis = cv_results_knn['param_n_neighbors']
%matplotlib inline
plt.plot(x_axis, cv_results_knn['mean_train_score'], label = 'Mean Train Score', c = 'r', marker='o', linestyle='-')
plt.plot(x_axis, cv_results_knn['mean_test_score'], label = 'Mean Test Score', c='b', marker='o', linestyle='-')
plt.xlabel('k')
plt.ylabel('Accuracy')
plt.legend()

: 

In [None]:
# Start the report with the KNN row:
# [model, tuned parameter, train accuracy, test accuracy, train F1, test F1].
# grid_knn.score() applies the search's scorer ('f1') and would just duplicate the
# F1 columns; the refit best estimator's own score() returns mean accuracy.
# The parameter label is read from best_params_ so it always matches the selected model.
knn_best = grid_knn.best_estimator_
report_table = [[
    'KNN',
    'k = {}'.format(grid_knn.best_params_['n_neighbors']),
    knn_best.score(X_train, y_train),
    knn_best.score(X_test, y_test),
    f1_score(y_train, y_knn_train_predict),  # f1_score expects (y_true, y_pred)
    f1_score(y_test, y_knn_predict),
]]

: 

### 2. Linear SVC

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Linear-kernel SVM; the grid below tunes its regularisation parameter C.
svc_linear = SVC(kernel = 'linear') 
param_grid = {'C':[0.01, 0.1, 1, 10]}
print("Parameter grid for Linear SVC: {}".format(param_grid))

# 5-fold grid search selecting on F1; n_jobs=-1 runs the fits on all available cores.
# The search is only configured here; fitting happens in the next cell.
grid_svcLinear = GridSearchCV(svc_linear, param_grid = param_grid, cv = 5, n_jobs = -1, scoring='f1', return_train_score=True)

: 

In [None]:
# Run the cross-validated search over C on the scaled training partition.
grid_svcLinear.fit(X_train, y_train)

: 

In [None]:
# Tabulate the linear SVC grid-search results and show, for each candidate C,
# the mean training-fold and validation-fold scores.
cv_results_lsvc = pd.DataFrame(grid_svcLinear.cv_results_)
cv_results_lsvc.loc[:, ['param_C', 'mean_train_score', 'mean_test_score']]

: 

In [None]:
# Training vs. validation score for each candidate C (log-scaled x axis,
# since the candidates span several orders of magnitude).
x_axis = cv_results_lsvc['param_C']
plt.plot(x_axis, cv_results_lsvc['mean_train_score'], c = 'r', label = 'Mean Train Score', marker='o', linestyle='-')
plt.plot(x_axis, cv_results_lsvc['mean_test_score'], c = 'b', label = 'Mean Test Score', marker='o', linestyle='-')
plt.legend()
plt.title('Scores from 5 fold cross validation')
plt.xlabel('C')
# The grid search was scored with scoring='f1', so the plotted values are F1, not accuracy.
plt.ylabel('Mean F1 Score')
plt.xscale('log')

: 

In [None]:
# Summarise the linear SVC search. The search was built with scoring='f1', so
# best_score_ and both score() calls below report F1, not accuracy.
print("Best cross-validation score: {:.4f}".format(grid_svcLinear.best_score_))
print('Best penalty term:',grid_svcLinear.best_params_)
print("Training Score: {:.4f}".format(grid_svcLinear.score(X_train, y_train)))
print("Testing Score: {:.4f}".format(grid_svcLinear.score(X_test, y_test)))

: 

In [None]:
# Predictions from the refit best linear SVC on the training and test partitions.
y_svcLinear_predict_train = grid_svcLinear.predict(X_train)
y_svcLinear_predict = grid_svcLinear.predict(X_test)

: 

In [None]:
# Append the Linear SVC row:
# [model, tuned parameter, train accuracy, test accuracy, train F1, test F1].
# grid_svcLinear.score() applies the search's scorer ('f1') and would just duplicate
# the F1 columns; the refit best estimator's own score() returns mean accuracy.
# The parameter label is read from best_params_ so it always matches the selected model.
svc_best = grid_svcLinear.best_estimator_
report_table = report_table + [[
    'Linear SVC',
    'C = {}'.format(grid_svcLinear.best_params_['C']),
    svc_best.score(X_train, y_train),
    svc_best.score(X_test, y_test),
    f1_score(y_train, y_svcLinear_predict_train),  # f1_score expects (y_true, y_pred)
    f1_score(y_test, y_svcLinear_predict),
]]

: 

### 3. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Tune the tree depth with 5-fold cross-validation, selecting on F1.
param_grid = {'max_depth':[1, 2, 3, 4, 5, 6]}
# Fix random_state: the tree permutes the features at every split, so without a
# seed equally good splits can resolve differently between runs and change the
# fitted trees (and potentially the selected depth).
dtree = DecisionTreeClassifier(random_state = 0)

grid_tree = GridSearchCV(dtree, param_grid, cv = 5, scoring='f1', return_train_score=True)
grid_tree.fit(X_train, y_train)

: 

In [None]:
# Tree depth selected by the grid search.
grid_tree.best_params_

: 

In [None]:
# Predictions from the refit best decision tree on the training and test partitions.
y_dtree_predict_train = grid_tree.predict(X_train)
y_dtree_predict = grid_tree.predict(X_test)

: 

In [None]:
# Training vs. validation score for each candidate max_depth.
plt.figure(figsize=(10, 6))
plt.plot(grid_tree.cv_results_['param_max_depth'], grid_tree.cv_results_['mean_train_score'], c='r', label='Mean Train Score', marker='o', linestyle='-')
plt.plot(grid_tree.cv_results_['param_max_depth'], grid_tree.cv_results_['mean_test_score'], c='b', label='Mean Test Score', marker='o', linestyle='-')
plt.xlabel('Max Depth')
# The grid search was scored with scoring='f1', so the plotted values are F1, not accuracy.
plt.ylabel('Mean F1 Score')
plt.title('Decision Tree: Scores from 5-Fold Cross Validation')
plt.legend()
plt.grid(True)
plt.show()

: 

In [None]:
# Append the Decision Tree row:
# [model, tuned parameter, train accuracy, test accuracy, train F1, test F1].
# grid_tree.score() applies the search's scorer ('f1') and would just duplicate the
# F1 columns; the refit best estimator's own score() returns mean accuracy.
# The parameter label is read from best_params_ so it always matches the selected model.
tree_best = grid_tree.best_estimator_
report_table = report_table + [[
    'Decision Tree',
    'max_depth = {}'.format(grid_tree.best_params_['max_depth']),
    tree_best.score(X_train, y_train),
    tree_best.score(X_test, y_test),
    f1_score(y_train, y_dtree_predict_train),  # f1_score expects (y_true, y_pred)
    f1_score(y_test, y_dtree_predict),
]]

: 

In [None]:
# Assemble the per-model rows into a frame indexed by model name; the
# 'Model name' column is kept alongside the index (drop=False).
report_columns = ['Model name', 'Model parameter', 'Train accuracy', 'Test accuracy', 'Train F1 score', 'Test F1 score']
report = pd.DataFrame(report_table, columns=report_columns).set_index('Model name', drop=False)
report

: 

In [None]:
# Plotting the Mean Test Scores for KNN, Linear SVC, and Decision Tree
# NOTE(review): the three curves share one x axis although each model is tuned
# over a different hyperparameter (k, C, max_depth) — confirm this is intended.
plt.figure(figsize=(10, 6))

# (legend label, hyperparameter values, mean validation score) per model.
curves = [
    ('KNN', cv_results_knn['param_n_neighbors'], cv_results_knn['mean_test_score']),
    ('Linear SVC', cv_results_lsvc['param_C'], cv_results_lsvc['mean_test_score']),
    ('Decision Tree', grid_tree.cv_results_['param_max_depth'], grid_tree.cv_results_['mean_test_score']),
]
for curve_label, param_values, mean_scores in curves:
    plt.plot(param_values, mean_scores, label=curve_label, marker='o', linestyle='-')

plt.xlabel('Parameter Value')
plt.ylabel('Mean Test Score')
plt.title('Comparison of Classification Algorithms')
plt.legend()
plt.grid(True)
plt.show()

: 