In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme(color_codes=True)
#ML libraries
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
#libraries for data vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
#libraries for data pre-processing and NLP
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
import re
import string


In [2]:
df_train = pd.read_csv("data/train.csv")
df_train = df_train.drop(columns=['id'])
df_train.head()

Unnamed: 0,title,author,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
df_train = df_train[df_train['text'].notna()]
df_train = df_train[['text','label']]
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20761 entries, 0 to 20799
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    20761 non-null  object
 1   label   20761 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 486.6+ KB


# Preprocessing data #

In [6]:
#function, which will clean the data from punctuations, digits and other invalid characters. 
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\w*\d\w*', '', text).strip() 
    text = re.sub(r'\d+', '', text)
    text = ''.join([i for i in text if i not in string.punctuation])
    return text.split()

to_remove = stopwords.words('english')

#function which will remove from given review all stopwords, which occurs in english language (words like 'in','he,'where','when' etc.).
def remove_stopword(text):
    return np.setdiff1d(text, to_remove)

#function which will perform lemmatization of words in given review, i.e. convert different forms of one word into one form,
#so they can be analysis as single word 
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    output = [lemmatizer.lemmatize(word) for word in text]
    return " ".join(output)
    

In [7]:
#perform preprocessing
df_train['text'] = df_train['text'].apply(lambda x: lemmatize_text(remove_stopword(preprocess(x))))
df_train.head()

Unnamed: 0,text,label
0,ablaze abusive according acting actually advan...,1
1,adam addressed address ago ahead” alinsky amer...,0
2,able academic academy accept accepting accurat...,1
3,abdul able active address afghan afghanistan a...,1
4,according activist adultery anyone” apartment ...,1


In [8]:
vectorizer = CountVectorizer(max_features=5000, min_df=0, max_df=0.7)
X = vectorizer.fit_transform(df_train['text']).toarray()

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, df_train['label'],\
    test_size = 0.2, random_state=12)

In [10]:
from sklearn.svm import LinearSVC

classifier = LinearSVC(dual=False)
classifier.fit(X_train, y_train)

LinearSVC(dual=False)

In [11]:
#Testing our model
y_predict = classifier.predict(X_test)
y_prob = classifier._predict_proba_lr(X_test)[:,1]

In [12]:
#evaluate the model
print(classification_report(y_test,y_predict)) 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      2074
           1       0.94      0.93      0.94      2079

    accuracy                           0.94      4153
   macro avg       0.94      0.94      0.94      4153
weighted avg       0.94      0.94      0.94      4153

AUC: 0.9831788983187247
