### DO NOT RUN BELOW CELL IF YOU ARE RUNNING IT IN LOCAL MACHINE

In [1]:
import os
from google.colab import drive
drive.mount('/content/gdrive')
base_dir = './gdrive/My Drive/MS/Sem2/ADS/ADS Assignment 2'
os.chdir(base_dir)
!ls

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
1preprocessed_data.csv	      7data_with_tprox_features_final.csv
2bow_features_extracted.csv   ADS_A2.ipynb
3data_with_bow_features.csv   Assignment#2.ppt
4sent_features.csv	      train_bodies.csv
5data_with_sent_features.csv  train_stances.csv
6sent_features.csv


### ========================================================================================

# Load Libraries

In [2]:
!pip install autocorrect

import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
import nltk
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.corpus import wordnet as wn
from string import punctuation
from nltk.stem import WordNetLemmatizer 
from autocorrect import spell
  
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')


words_corpus = nltk.corpus.words.words()

lemm = WordNetLemmatizer()
ps = PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Load Dataset

In [0]:
train_bodies = pd.read_csv('train_bodies.csv', encoding='latin-1', skiprows=1)
train_stances = pd.read_csv('train_stances.csv', encoding='latin-1', skiprows=1)

train_stances.columns = ['headline','body_id','stance']
train_bodies.columns = ['body_id','article_body']

In [0]:
train_bodies.shape, train_stances.shape

((1685, 2), (49974, 3))

In [0]:
train_bodies = train_bodies.dropna()
train_stances = train_stances.dropna()
train_bodies['body_id'] = train_bodies.body_id.astype(int)
train_stances['body_id'] = train_stances.body_id.astype(int)

In [0]:
data = train_stances.merge(train_bodies, how='inner', on=['body_id']).drop_duplicates()

# Data Preprocessing

In [0]:
def preprocess_sentences(s):
  s = s.lower().encode(encoding='ASCII',errors='ignore').decode()
  stop_words_set = list(stopwords.words('english'))
  sent_tokenized = list(nltk.word_tokenize(s))
  punctuations = list(punctuation)
  sent_tokenized =  ['' if w in stop_words_set+punctuations else ps.stem(lemm.lemmatize(w)) for w in sent_tokenized]
  

  while True:
    try:
      sent_tokenized.remove('')
    except:
      break
   
  return ' '.join(sent_tokenized).strip()

In [0]:
data.headline = data.headline.apply(lambda x: preprocess_sentences(x))
data.article_body = data.article_body.apply(lambda x: preprocess_sentences(x))

In [0]:
data.to_csv('1preprocessed_data.csv', index=False)

# Features Extraction

In [0]:
data = pd.read_csv('1preprocessed_data.csv', index_col=False)
data.shape

(49570, 4)

## Bag of Words

In [0]:
# Bag of Words (BOW)
def BOW(headline, article_body):
  
  """
  Input:  Headline & Article Body
  Return: [No of Common Words, No. of Uncommon Words, Frequency of Common Words]
  """
  
  article = nltk.word_tokenize(article_body)
  headline = nltk.word_tokenize(headline)

  common_words = np.intersect1d(headline, article)

  not_common_in_head = np.unique(np.setdiff1d(headline, common_words))
  not_common_in_article = np.unique(np.setdiff1d(article, common_words))
  not_common_len = not_common_in_head.shape[0] + not_common_in_article.shape[0]

  article_series = pd.Series(article)
  article_freq = article_series[article_series.isin(common_words)].value_counts().sum()

  headline_series = pd.Series(headline)
  headline_freq = headline_series[headline_series.isin(common_words)].value_counts().sum()

  
  return [common_words.shape[0], not_common_len, article_freq+headline_freq]

In [0]:
bow_feature_cols = ['total_common','total_uncommon','total_common_freq']
bow_features = pd.DataFrame(data.apply(lambda x: BOW(x['headline'], x['article_body']), axis=1).values.tolist(), columns=bow_feature_cols)
bow_features.to_csv('2bow_features_extracted.csv', index=False)

In [0]:
# Combine new features with old ones
data = pd.concat([data, bow_features], axis=1)
data.to_csv('3data_with_bow_features.csv', index=False)

## Sentiment based features

In [0]:
data = pd.read_csv('3data_with_bow_features.csv', index_col=False)
data.shape, data.columns

((49570, 7),
 Index(['headline', 'body_id', 'stance', 'article_body', 'total_common',
        'total_uncommon', 'total_common_freq'],
       dtype='object'))

In [0]:
from nltk.corpus import sentiwordnet as swn
nltk.download('sentiwordnet')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




True

In [0]:
def sentiment_based_features(headline,  article_body):

  """
  Input:  Headline & Article Body
  Return: [sentiment score, total positives, total_negatives, positive score, negative score, positive to negative ratio]
  """

  article = nltk.word_tokenize(article_body)
  headline = nltk.word_tokenize(headline)

  common_words = np.intersect1d(headline, article)
  if common_words.shape[0] < 1:
    return [0, 0, 0, 0, 0, 0]

  sid = SentimentIntensityAnalyzer()
  sentiment_values = pd.DataFrame(pd.Series(common_words).apply(lambda x: sid.polarity_scores(x)).values.tolist())

  score = sentiment_values['compound'].sum()
  total_pos = (sentiment_values['pos'] > 0).sum()
  pos_score = sentiment_values['pos'].sum()
  total_neg = (sentiment_values['neg'] > 0).sum()
  neg_score = sentiment_values['neg'].sum()

  pos_ratio = 0
  if total_pos+total_neg > 0:
    pos_ratio = float(total_pos)/(total_pos+total_neg)


  return [score, total_pos, total_neg, pos_score, neg_score, pos_ratio]

In [0]:
sent_features_cols = ['sent_score','total_pos','total_neg','pos_score','neg_score','pos_neg_ratio']
sent_features = pd.DataFrame(data.apply(lambda x: sentiment_based_features(x['headline'], x['article_body']), axis=1).values.tolist(), columns=sent_features_cols)

In [0]:
sent_features.to_csv('4sent_features.csv', index=False)

In [0]:
# Combine new features with old ones
data = pd.concat([data, sent_features], axis=1)
data.to_csv('5data_with_sent_features.csv', index=False)

## Term Proximity Features

In [0]:
data = pd.read_csv('5data_with_sent_features.csv', index_col=False)
data.shape, data.columns

((49570, 13),
 Index(['headline', 'body_id', 'stance', 'article_body', 'total_common',
        'total_uncommon', 'total_common_freq', 'sent_score', 'total_pos',
        'total_neg', 'pos_score', 'neg_score', 'pos_neg_ratio'],
       dtype='object'))

In [0]:

def term_proximity_features(headline,  article_body):
  
  """
  Input:  Headline & Article Body
  Return: [total distance, average distance, minimum distance, maximum distance]
  """

  # article_body = data.article_body.iloc[10]
  # headline = data.headline.iloc[10]

  article_tokenized = nltk.word_tokenize(article_body)
  headline_tokenized = nltk.word_tokenize(headline)

  common_words = list(np.intersect1d(headline_tokenized, article_tokenized))
  if len(common_words) < 2:
    return [10000, 10000, 10000, 10000]

  headline_dists = [headline_tokenized.index(w) for w in common_words]
  article_dists = [article_tokenized.index(w) for w in common_words]

  positions_df = pd.DataFrame([headline_dists, article_dists]).T
  positions_df.columns = ['loc_in_head','loc_in_article']
  positions_df.index = common_words
  positions_df = (positions_df - positions_df.mean()) / (positions_df.max() - positions_df.min())

  distances = np.sqrt((positions_df.loc_in_head - positions_df.loc_in_article)**2)
  total_distance = distances.sum()
  avg_distance = total_distance/distances.shape[0]

  return [total_distance, avg_distance, distances.min(), distances.max()]

In [0]:
tprox_features_cols = ['total_distance','avg_distance','min_distance','max_distance']
tprox_features = pd.DataFrame(data.apply(lambda x: term_proximity_features(x['headline'], x['article_body']), axis=1).values.tolist(), columns=tprox_features_cols)

In [0]:
tprox_features.to_csv('6sent_features.csv', index=False)

In [0]:
# Combine new features with old ones
data = pd.concat([data, tprox_features], axis=1)
data.to_csv('7data_with_tprox_features_final.csv', index=False)

In [0]:
data.shape

(49570, 17)

# Implementation of Algorithms - Learning Models

In [3]:
data = pd.read_csv('7data_with_tprox_features_final.csv', index_col=False)
data.shape, data.columns

((49570, 17),
 Index(['headline', 'body_id', 'stance', 'article_body', 'total_common',
        'total_uncommon', 'total_common_freq', 'sent_score', 'total_pos',
        'total_neg', 'pos_score', 'neg_score', 'pos_neg_ratio',
        'total_distance', 'avg_distance', 'min_distance', 'max_distance'],
       dtype='object'))

In [0]:
feat = ['total_common',
        'total_uncommon', 'total_common_freq', 'sent_score', 'total_pos',
        'total_neg', 'pos_score', 'neg_score', 'pos_neg_ratio',
        'total_distance', 'avg_distance', 'min_distance', 'max_distance']
label = ['stance']

In [0]:
# data[feat].mean()

## Normalizing data (Min-Max Normalization)

In [0]:
norm_data = (data[feat]-data[feat].mean())/(data[feat].max() - data[feat].min())
norm_data.shape

In [0]:
#@title Default title text


(49570, 13)

## Encoding Labels (i.e. converting to numeric)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(data[label].values)
data[label] = list(le.transform(data[label].values))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([3, 2, 3, ..., 0, 2, 2])

## Train Test Split (70-30 ratio)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(norm_data, data[label].stance.values , test_size=0.30, random_state=42, stratify = data[label].stance.values)

In [23]:
X_train.shape, X_test.shape

((34699, 13), (14871, 13))

## K Nearest Neighbor

In [0]:
from sklearn.neighbors import KNeighborsClassifier

In [13]:
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)
knn_model.score(X_test, y_test)

0.858920045726582

## Decision Tree

In [0]:
from sklearn.tree import DecisionTreeClassifier

In [16]:
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
tree_model.score(X_test, y_test)

0.8494385044717907

## Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
model = RandomForestClassifier(n_estimators=20)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8768744536345908

## Neural Networks

In [0]:
from sklearn.neural_network import MLPClassifier

In [22]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8743191446439379