In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import gensim
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import gensim.downloader as api
from sklearn.model_selection import GridSearchCV
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#### Loading the google news model is necessary for this project

In [10]:
# Load the gensim's google news model

wv = api.load('word2vec-google-news-300')

#### Loading spacy's large english model 

In [12]:
# Load Spacy's large english language model
nlp = spacy.load('en_core_web_lg')

In [31]:
df = pd.read_csv("Dataset/Classification/headline_data.csv")
df.head()

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [32]:
# What is the shape of the data?

df.shape

(75385, 3)

- There are about 75,000 records. This is quite large enough to give a good accuracy

In [33]:
# What do the headline contents look like?

df.Headline.iloc[20:31]

20    Tropical spider burrows under man's skin throu...
21    Boko Haram ceasefire ignored as violence flare...
22    NBC's Tom Brokaw reportedly wants Brian Willia...
23    Would you take a bite out of the world's oldes...
24    NET Extra: Back-from-the-dead Catholic priest ...
25    Rumor debunked: RoboCop-style robots are not p...
26    Luke Somers Dies In Rescue Attempt, Sister Say...
27    Christian Bale Exits Steve Jobs Movie (Exclusive)
28    Microsoft Tried Out Robot Security Guards on I...
29    Report: Christian Bale Just Bailed on the Stev...
30    Islamic Militants Post Video Claiming to Show ...
Name: Headline, dtype: object

In [34]:
df.Headline.iloc[20:31][27]

'Christian Bale Exits Steve Jobs Movie (Exclusive)'

In [35]:
# Checking for missing values 

df.isnull().sum()

Headline    0
Body ID     0
Stance      0
dtype: int64

- No missing values in this data

In [36]:
# Checking for duplicates

df.duplicated().sum()

644

- 644 duplicates? We have to drop them!

In [37]:
# We have the duplicates
df = df.drop_duplicates()

In [38]:
df.duplicated().sum(), df.shape

(0, (74741, 3))

- Dropped, we now have about 74,700 records, so we good to go

In [39]:
# The body ID is not important for this project, so it has to be dropped

df = df.drop('Body ID', axis=1)
df.head()

Unnamed: 0,Headline,Stance
0,Police find mass graves with at least '15 bodi...,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,agree
2,"Christian Bale passes on role of Steve Jobs, a...",unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,unrelated
4,Spider burrowed through tourist's stomach and ...,disagree


In [40]:
# The Stance values need to be label encode, so we can be able to work with them
df.Stance.unique()

array(['unrelated', 'agree', 'disagree', 'discuss'], dtype=object)

In [41]:
df['Stance'].replace({'agree': 1,'disagree': 2, 'discuss': 3, 'unrelated': 4}, inplace=True)
df.head()

Unnamed: 0,Headline,Stance
0,Police find mass graves with at least '15 bodi...,4
1,Hundreds of Palestinians flee floods in Gaza a...,1
2,"Christian Bale passes on role of Steve Jobs, a...",4
3,HBO and Apple in Talks for $15/Month Apple TV ...,4
4,Spider burrowed through tourist's stomach and ...,2


In [42]:
# The Headline column needs to also be vectorized
# A function to preprocess and vectorize the texts should be okay

def preprocess_and_vectorize(text):
    doc = nlp(text)
    
    filtered_tokens = []
    for token in  doc:
        if token.is_punct or token.is_stop:
            continue
        filtered_tokens.append(token.lemma_)
    return wv.get_mean_vector(filtered_tokens)

In [43]:
df['headline_vect'] = df['Headline'].apply(lambda text: preprocess_and_vectorize(text))

In [44]:
df.head()

Unnamed: 0,Headline,Stance,headline_vect
0,Police find mass graves with at least '15 bodi...,4,"[0.029742045, 0.0135429725, 0.0433583, -0.0003..."
1,Hundreds of Palestinians flee floods in Gaza a...,1,"[0.043757837, -0.0018107556, -0.020637596, 0.0..."
2,"Christian Bale passes on role of Steve Jobs, a...",4,"[0.07894319, 0.004406103, -0.030460116, -0.024..."
3,HBO and Apple in Talks for $15/Month Apple TV ...,4,"[-0.021333748, 0.0114985565, -0.024023581, 0.0..."
4,Spider burrowed through tourist's stomach and ...,2,"[0.036587663, 0.03131389, -0.022689383, 0.0119..."


In [45]:
df.dtypes

Headline         object
Stance            int64
headline_vect    object
dtype: object

In [46]:
df.headline_vect.sample(20)

65534    [-0.06649462, 0.013062145, -0.048454307, 0.024...
28360    [0.0046069985, 0.014208828, 0.0054771015, 0.04...
30226    [0.033261463, 0.021965234, 0.06343346, 0.06153...
16779    [0.003960216, 0.0053887255, 0.009690851, 0.046...
49474    [0.020533083, 0.002747034, 0.028274242, 0.0345...
3693     [0.029753555, 0.039555762, -0.018220456, 0.046...
34202    [-0.017287137, 0.00743858, -0.001824721, 0.025...
8087     [0.005232451, -0.027328089, -0.011303009, 0.01...
45213    [0.013866857, -0.012832581, 0.039726797, 0.065...
36572    [0.0110958945, 0.019324113, -0.018801127, 0.01...
55481    [-0.014474019, -0.008579992, -0.047507603, 0.0...
50346    [-0.005872951, 0.021830097, -0.010244614, 0.02...
68624    [0.006551439, 0.016327824, 0.0031818582, 0.010...
14476    [0.026897302, 0.023907341, 0.021463612, 0.0285...
21457    [-0.0009906583, -0.008120957, 0.009905365, 0.0...
30779    [-0.0003287983, 0.023703804, -0.019402778, 0.0...
55842    [-0.012106235, 0.014858723, 0.01503253, 0.0303.

In [47]:
df.Stance.value_counts()

Stance
4    54446
3    13237
1     5532
2     1526
Name: count, dtype: int64

- There is class imbalance
- we  will have to oversample classes 1,2,3 and undersample class 4 to 10,000


### Preprocessing

In [48]:
## Splitting the dataset into thye feature(s) and target

X = df['headline_vect']
y = df['Stance']

In [49]:
## There is class imbalance in the target column
## Oversampling the classes on 12,000

sample_num = 15000

df_agree = df[df.Stance == 1].sample(sample_num, random_state=121, replace=True)
df_disagree = df[df.Stance == 2].sample(sample_num, random_state=121, replace=True)
df_unrelated = df[df.Stance == 4].sample(sample_num, random_state=121)
df_discuss = df[df.Stance == 3].sample(sample_num, random_state=121, replace=True)


# Using concat function to concatenate the classes row-rise

df_balanced = pd.concat([df_agree, df_disagree, df_unrelated, df_discuss], axis=0)
df_balanced.head()

Unnamed: 0,Headline,Stance,headline_vect
46608,An Indian Civil Servant Just Got Sacked After ...,1,"[-0.0056338636, 0.012650628, -0.019290624, 0.0..."
62784,"“I am lost for words,” 5-year-old boy billed f...",1,"[0.05137522, 0.0036255426, 0.02222711, 0.03495..."
22861,"No, A Spider Did Not Burrow Under A Man's Skin...",1,"[-8.1885606e-05, 0.044481948, -0.014884096, 0...."
48950,Axl Rose Found Dead of Sh*tty Hoax Website at ...,1,"[0.06331376, -0.00691864, -0.038948596, 0.0112..."
29057,Michael Phelps’ girlfriend Taylor Lianne Chand...,1,"[0.023934864, -0.010345539, -0.040727712, -0.0..."


In [50]:
df_balanced.Stance.value_counts()

Stance
1    15000
2    15000
4    15000
3    15000
Name: count, dtype: int64

In [51]:
X = df_balanced['headline_vect'].values
y = df_balanced['Stance'].values

In [52]:
## Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=121)

In [53]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((48000,), (12000,), (48000,), (12000,))

In [54]:
X_train = np.array(X_train)

In [55]:
X_train = np.stack(X_train)
X_train

array([[ 0.03805136,  0.02114049, -0.041832  , ..., -0.02060655,
         0.05296854,  0.01858594],
       [-0.00503733, -0.00733214,  0.00400608, ..., -0.01834936,
        -0.01215025, -0.00023521],
       [ 0.03169822,  0.00963411,  0.02370399, ...,  0.03205358,
         0.02881443,  0.03334656],
       ...,
       [ 0.00136664, -0.00312488, -0.02287656, ..., -0.04825032,
         0.02974315, -0.02987019],
       [-0.01213469,  0.07464661, -0.00328022, ..., -0.02966163,
         0.00625687,  0.03011782],
       [-0.01535544,  0.05845592, -0.02950395, ..., -0.00375087,
        -0.02238054,  0.00533413]], dtype=float32)

In [56]:
X_test = np.stack(X_test)
X_test

array([[-0.01400317,  0.01431732, -0.01145621, ...,  0.00765424,
        -0.05022866,  0.00969823],
       [-0.03565037, -0.00888677,  0.01411753, ..., -0.03753102,
        -0.002705  ,  0.02117172],
       [-0.01011314,  0.01783596,  0.01642643, ..., -0.00716076,
         0.01727   ,  0.00548617],
       ...,
       [ 0.04382846, -0.0082768 , -0.00841844, ...,  0.01450387,
         0.0251844 ,  0.0020925 ],
       [ 0.00325275,  0.02488003, -0.00076931, ..., -0.010082  ,
         0.03152272, -0.04391581],
       [ 0.01925507,  0.00118381,  0.02931359, ..., -0.04797569,
         0.0233765 ,  0.03139329]], dtype=float32)

- These values are both negative and positive values
- It is necessary to employ the scaling method
- The MinMax Scaler is best suited for this

In [60]:
from sklearn.preprocessing import MinMaxScaler

In [61]:
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)

In [62]:
X_test_scaled = scaler.transform(X_test)

### Model Building

In [63]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

### Using the MultinomialNB classifier

In [64]:
naive_clf  = MultinomialNB()
naive_clf.fit(X_train_scaled, y_train)

In [65]:
y_train_pred = naive_clf.predict(X_train_scaled)

print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           1       0.44      0.42      0.43     12013
           2       0.46      0.50      0.48     11956
           3       0.41      0.54      0.47     11961
           4       0.30      0.19      0.23     12070

    accuracy                           0.41     48000
   macro avg       0.40      0.41      0.40     48000
weighted avg       0.40      0.41      0.40     48000



#### NAIVE BAYES EVALUATION

In [66]:
# Evaluation
y_pred = naive_clf.predict(X_test_scaled)

print('Test Accuracy:')
print(classification_report(y_test, y_pred))

Test Accuracy:
              precision    recall  f1-score   support

           1       0.44      0.43      0.43      2987
           2       0.48      0.50      0.49      3044
           3       0.41      0.53      0.46      3039
           4       0.28      0.19      0.22      2930

    accuracy                           0.41     12000
   macro avg       0.40      0.41      0.40     12000
weighted avg       0.40      0.41      0.40     12000



### Using Random Forest Classifier

In [67]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train_scaled, y_train)

In [68]:
y_train_pred = rf_clf.predict(X_train_scaled)

print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           1       0.62      0.66      0.64     12013
           2       0.64      0.91      0.75     11956
           3       0.67      0.62      0.64     11961
           4       0.63      0.38      0.47     12070

    accuracy                           0.64     48000
   macro avg       0.64      0.64      0.63     48000
weighted avg       0.64      0.64      0.63     48000



#### RANDOM FOREST EVALUATION

In [69]:
y_pred = rf_clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.58      0.63      0.60      2987
           2       0.64      0.89      0.74      3044
           3       0.64      0.57      0.60      3039
           4       0.54      0.34      0.42      2930

    accuracy                           0.61     12000
   macro avg       0.60      0.60      0.59     12000
weighted avg       0.60      0.61      0.59     12000



### Using KNN

In [70]:
knn_clf = KNeighborsClassifier()

knn_clf.fit(X_train_scaled, y_train)

In [71]:
y_train_pred = knn_clf.predict(X_train_scaled)

print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           1       0.55      0.66      0.60     12013
           2       0.65      0.80      0.71     11956
           3       0.62      0.57      0.59     11961
           4       0.55      0.37      0.44     12070

    accuracy                           0.60     48000
   macro avg       0.59      0.60      0.59     48000
weighted avg       0.59      0.60      0.59     48000



#### KNN EVALUATION

In [72]:
y_pred = knn_clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.53      0.64      0.58      2987
           2       0.65      0.78      0.71      3044
           3       0.59      0.52      0.55      3039
           4       0.48      0.33      0.39      2930

    accuracy                           0.57     12000
   macro avg       0.56      0.57      0.56     12000
weighted avg       0.56      0.57      0.56     12000



### Using SVC

In [73]:
svc_clf = SVC()
svc_clf.fit(X_train_scaled, y_train)

In [74]:
y_train_pred = svc_clf.predict(X_train_scaled)
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           1       0.61      0.66      0.64     12013
           2       0.64      0.90      0.75     11956
           3       0.66      0.62      0.64     11961
           4       0.63      0.35      0.45     12070

    accuracy                           0.63     48000
   macro avg       0.63      0.63      0.62     48000
weighted avg       0.63      0.63      0.62     48000



#### SVC EVALUATIO0N

In [124]:
y_pred = svc_clf.predict(X_test_scaled)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.58      0.64      0.61      2987
           2       0.64      0.89      0.74      3044
           3       0.63      0.56      0.59      3039
           4       0.55      0.32      0.40      2930

    accuracy                           0.61     12000
   macro avg       0.60      0.60      0.59     12000
weighted avg       0.60      0.61      0.59     12000



### Using Gradient Boosting Classifier

In [75]:
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train_scaled, y_train)

In [76]:
y_pred_train = gb_clf.predict(X_train_scaled)
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           1       0.59      0.60      0.60     12013
           2       0.64      0.84      0.73     11956
           3       0.58      0.62      0.60     11961
           4       0.54      0.33      0.41     12070

    accuracy                           0.60     48000
   macro avg       0.59      0.60      0.58     48000
weighted avg       0.59      0.60      0.58     48000



#### GB EVALUATION

In [77]:
y_pred = gb_clf.predict(X_test_scaled)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.57      0.58      0.57      2987
           2       0.63      0.83      0.72      3044
           3       0.54      0.55      0.55      3039
           4       0.47      0.30      0.36      2930

    accuracy                           0.57     12000
   macro avg       0.55      0.56      0.55     12000
weighted avg       0.55      0.57      0.55     12000



- Among the 5 classifiers, Random Forest performed better
- The training accuracy is 64% while test accuracy is 61%
- So Random Forest it is!

### SAVING THE MODEL

In [78]:
import pickle

In [79]:
filename = 'headline_stance.pkl'
with open(filename, 'wb') as file:
    pickle.dump(rf_clf, file)

print('Model has been saved successfully!')

Model has been saved successfully!
