In [None]:
import pandas as pd

# Load the train/test tables first, then the matching label files as NumPy arrays.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("data/data_final/train_data.csv", "data/data_final/test_data.csv")
)
y_train, y_test = (
    pd.read_csv(csv_path).to_numpy()
    for csv_path in ("data/data_final/train_y.csv", "data/data_final/test_y.csv")
)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

def create_tfidf(analyzer, char_low, char_high, X_train, X_test):
    """Build bag-of-words and TF-IDF matrices for one analyzer / n-gram setting.

    Both the count vectorizer and the TF-IDF transformer are fitted on
    ``X_train`` only and then applied to ``X_test``.

    Args:
        analyzer: Forwarded to ``CountVectorizer(analyzer=...)``.
        char_low: Lower bound of the n-gram range.
        char_high: Upper bound of the n-gram range.
        X_train: Training documents.
        X_test: Test documents.

    Returns:
        A list ``[analyzer, char_low, char_high, train_bow, test_bow,
        train_tfidf, test_tfidf]``.
    """
    vectorizer = CountVectorizer(analyzer=analyzer, ngram_range=(char_low, char_high))
    train_counts = vectorizer.fit_transform(X_train)
    test_counts = vectorizer.transform(X_test)

    # norm=None: the TF-IDF rows are left un-normalised.
    weighter = TfidfTransformer(norm=None)
    train_weighted = weighter.fit_transform(train_counts)
    test_weighted = weighter.transform(test_counts)

    settings = [analyzer, char_low, char_high]
    return settings + [train_counts, test_counts, train_weighted, test_weighted]

In [None]:
analyzer_types = ["word", "char", "char_wb"]
ngram_ranges = [1, 2, 3, 4, 5]

# One entry per (analyzer, n) pair in analyzer-major order; each n is used as
# both the lower and the upper bound of the n-gram range.
features_list = [
    create_tfidf(analyzer, ngram, ngram, train_data['clean_text'], test_data['clean_text'])
    for analyzer in analyzer_types
    for ngram in ngram_ranges
]

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
plt.plot(n_comp, explained)
plt.xlabel('Number of components')
plt.ylabel("Explained Variance")
plt.title("Plot of Number of components v/s explained variance")
plt.show()

In [None]:
from sklearn.decomposition import TruncatedSVD

# Candidate component counts, tried in ascending order for every feature set.
n_comp = [5,10,15,20,50,100,150,200,500,700,800,900,1000,1500,2000,2500,3000,3500]
# The search for a feature set stops once this share of variance is explained.
variance_target = 0.95

# One [analyzer, char_low, char_high, n_components] entry per feature set, in
# the same order as features_list.
component_list = []

for feature_set in features_list:
    tfidf_train = feature_set[5]
    # TruncatedSVD cannot extract more components than the matrix has columns
    # (features); the row count is included as well to stay within its rank.
    max_components = min(tfidf_train.shape)
    explained = []
    chosen = None

    for x in n_comp:
        if x > max_components:
            break
        # Same n_iter / random_state as the SVD that is later applied to the
        # features, so the measured variance matches that fit and is reproducible.
        svd = TruncatedSVD(n_components=x, n_iter=7, random_state=42)
        svd.fit(tfidf_train)
        ratio = svd.explained_variance_ratio_.sum()
        explained.append(ratio)
        chosen = x
        if ratio > variance_target:
            break

    if chosen is None:
        # Even the smallest candidate exceeds the matrix size.
        chosen = max_components

    # Always record an entry (falling back to the largest count tried when the
    # target is never reached) so component_list stays aligned with features_list.
    component_list.append([feature_set[0], feature_set[1], feature_set[2], chosen])

In [None]:
# component_list is consumed positionally, so it must line up with features_list.
if len(component_list) != len(features_list):
    raise ValueError(
        "component_list has %d entries but features_list has %d"
        % (len(component_list), len(features_list))
    )

# Built as a new list of shallow copies: features_list keeps its TF-IDF
# matrices, so this cell can be re-run without fitting SVD on reduced data.
truncated_features_list = []

for feature_set, component_entry in zip(features_list, component_list):
    if list(component_entry[:3]) != list(feature_set[:3]):
        raise ValueError(
            "component_list entry %r does not match feature set %r"
            % (component_entry[:3], feature_set[:3])
        )

    n_components = component_entry[3]
    print(feature_set[0], feature_set[1], feature_set[2], n_components)

    svd = TruncatedSVD(n_components = n_components, n_iter=7, random_state=42)
    svd_x = svd.fit_transform(feature_set[5])   # fitted on the training TF-IDF only
    svd_tr = svd.transform(feature_set[6])

    reduced_set = list(feature_set)
    reduced_set[5] = svd_x
    reduced_set[6] = svd_tr
    truncated_features_list.append(reduced_set)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# The search space is the same for every feature set, so it is built once.
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'degree': [1, 2], 'gamma': [0.1, 0.01, 0.001, 0.0001], 'kernel': ['poly']},
  # SVC ignores `degree` for every kernel except 'poly', so sweeping it for
  # 'rbf' only repeats identical fits. A single value is kept so that
  # best_params_ still carries a 'degree' key for the evaluation cell.
  {'C': [1, 10, 100, 1000], 'degree': [1], 'gamma': [0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.1, 0.01, 0.001, 0.0001], 'kernel': ['sigmoid']}
 ]

# One [analyzer, char_low, char_high, best_params] entry per feature set.
param_list = []

for feature_set in truncated_features_list:
    rf_random = GridSearchCV(
        SVC(), param_grid, scoring='f1', cv = 3, verbose = 10, n_jobs = -1
        )

    # NOTE(review): labels are taken from train_data['Label'] here, while the
    # evaluation cells fit on y_train - confirm both hold the same labels.
    rf_random.fit(feature_set[5], train_data['Label'])

    param_list.append([feature_set[0], feature_set[1], feature_set[2], rf_random.best_params_])

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score

# One [analyzer, char_low, char_high, best_params, test_f1] entry per feature
# set. Built as a new list so param_list is left untouched and re-running this
# cell does not keep appending scores to it.
result_list = []

for hyperparameter, feature_set in zip(param_list, truncated_features_list):
    # best_params_ contains exactly the keys of the grid entry that won
    # (kernel, C and, where that grid entry has them, gamma / degree), so it
    # can be passed straight through. This covers all four searched kernels,
    # including 'poly'.
    test_model = SVC(**hyperparameter[3])

    test_model.fit(feature_set[5], y_train.ravel())
    y_true, y_pred = y_test.ravel(), test_model.predict(feature_set[6])

    result_list.append(list(hyperparameter) + [f1_score(y_true, y_pred)])

In [None]:
hc_train = pd.read_csv("data/data_final/train_hc.csv")
hc_test = pd.read_csv("data/data_final/test_hc.csv")
liwc_train = pd.read_csv("data/data_final/train_data_liwc.csv")
liwc_test = pd.read_csv("data/data_final/test_data_liwc.csv")

# Index 9 is the tenth feature set in build order: analyzer "char", 5-grams.
# The SVD columns get string names ("svd_0", "svd_1", ...): the CSV frames have
# string column names, and recent scikit-learn versions refuse to fit on a
# frame whose column names mix integers and strings.
train_df = pd.DataFrame(data = truncated_features_list[9][5]).add_prefix("svd_")
test_df = pd.DataFrame(data = truncated_features_list[9][6]).add_prefix("svd_")

# SVD + hc + LIWC features
comb_train_all = pd.concat([train_df, hc_train, liwc_train], axis = 1)
comb_test_all = pd.concat([test_df, hc_test, liwc_test], axis = 1)

# SVD + hc features (the test frame must use the test split)
comb_train_hc = pd.concat([train_df, hc_train], axis = 1)
comb_test_hc = pd.concat([test_df, hc_test], axis = 1)

# SVD + LIWC features
comb_train_liwc = pd.concat([train_df, liwc_train], axis = 1)
comb_test_liwc = pd.concat([test_df, liwc_test], axis = 1)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.decomposition import TruncatedSVD

# Sigmoid-kernel SVC on the SVD-reduced features of feature set 9
# (fit() returns the estimator itself, so it can be chained).
test_model = SVC(kernel='sigmoid', C=100, gamma=0.001).fit(
    truncated_features_list[9][5], y_train.ravel()
)
y_true = y_test.ravel()
y_pred = test_model.predict(truncated_features_list[9][6])
print(classification_report(y_true, y_pred))

In [None]:
import matplotlib.pyplot as plt
import itertools   
import numpy as np

from sklearn.metrics import confusion_matrix
# Raw (un-normalised) counts; rows are actual classes, columns are predictions.
cf = confusion_matrix(y_true, y_pred)

plt.imshow(cf,cmap=plt.cm.Blues,interpolation='nearest')
plt.colorbar()
plt.xlabel('Predicted')
plt.ylabel('Actual')
class_labels = ['0','1']
# One tick per class label. Counting the distinct values in y_pred instead
# gives too few ticks when the model predicts a single class, and xticks/yticks
# then reject the two labels.
tick_marks = np.arange(len(class_labels))
plt.xticks(tick_marks,class_labels)
plt.yticks(tick_marks,class_labels)
# Write each count inside its cell; white text on the darker cells.
thresh = cf.max() / 2.
for i,j in itertools.product(range(cf.shape[0]),range(cf.shape[1])):
    plt.text(j,i,format(cf[i,j],'d'),horizontalalignment='center',color='white' if cf[i,j] >thresh else 'black')
plt.show();

In [None]:
hc_train = pd.read_csv("data/data_final/train_hc.csv")
hc_test = pd.read_csv("data/data_final/test_hc.csv")
liwc_train = pd.read_csv("data/data_final/train_data_liwc.csv")
liwc_test = pd.read_csv("data/data_final/test_data_liwc.csv")

# Index 9 is the tenth feature set in build order: analyzer "char", 5-grams.
# The SVD columns get string names ("svd_0", "svd_1", ...): the CSV frames have
# string column names, and recent scikit-learn versions refuse to fit on a
# frame whose column names mix integers and strings.
train_df = pd.DataFrame(data = truncated_features_list[9][5]).add_prefix("svd_")
test_df = pd.DataFrame(data = truncated_features_list[9][6]).add_prefix("svd_")

# SVD + hc + LIWC features
comb_train_all = pd.concat([train_df, hc_train, liwc_train], axis = 1)
comb_test_all = pd.concat([test_df, hc_test, liwc_test], axis = 1)

# SVD + hc features (the test frame must use the test split)
comb_train_hc = pd.concat([train_df, hc_train], axis = 1)
comb_test_hc = pd.concat([test_df, hc_test], axis = 1)

# SVD + LIWC features
comb_train_liwc = pd.concat([train_df, liwc_train], axis = 1)
comb_test_liwc = pd.concat([test_df, liwc_test], axis = 1)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.decomposition import TruncatedSVD

# Degree-1 polynomial SVC on the combined SVD + hc + LIWC feature frame
# (fit() returns the estimator itself, so it can be chained).
test_model = SVC(kernel='poly', C=1, gamma=0.01, degree=1).fit(
    comb_train_all, y_train.ravel()
)
y_true = y_test.ravel()
y_pred = test_model.predict(comb_test_all)
print(classification_report(y_true, y_pred))

In [None]:
import matplotlib.pyplot as plt
import itertools   
import numpy as np

from sklearn.metrics import confusion_matrix
# Raw (un-normalised) counts; rows are actual classes, columns are predictions.
cf = confusion_matrix(y_true, y_pred)

plt.imshow(cf,cmap=plt.cm.Blues,interpolation='nearest')
plt.colorbar()
plt.xlabel('Predicted')
plt.ylabel('Actual')
class_labels = ['0','1']
# One tick per class label. Counting the distinct values in y_pred instead
# gives too few ticks when the model predicts a single class, and xticks/yticks
# then reject the two labels.
tick_marks = np.arange(len(class_labels))
plt.xticks(tick_marks,class_labels)
plt.yticks(tick_marks,class_labels)
# Write each count inside its cell; white text on the darker cells.
thresh = cf.max() / 2.
for i,j in itertools.product(range(cf.shape[0]),range(cf.shape[1])):
    plt.text(j,i,format(cf[i,j],'d'),horizontalalignment='center',color='white' if cf[i,j] >thresh else 'black')
plt.show();