In [2]:
import numpy as np
import matplotlib.pyplot as plt
import torch 
import pandas as pd
import os, sys

In [9]:
root_path = os.getcwd()
data_path = os.path.join(root_path, "Data")

train_path = os.path.join(data_path, "train_set.json")
test_path = os.path.join(data_path, "test_set.json")
documents_path = os.path.join(data_path, "documents.json")

In [15]:
import json

f = open(train_path)
train_set = json.load(f)
f = open(test_path)
test_set = json.load(f)
f = open(documents_path)
documents = json.load(f)

In [17]:
train_df = pd.read_json(train_path)
test_df = pd.read_json(test_path)
documents_df = pd.read_json(documents_path)

In [18]:
train_df.head()

Unnamed: 0,document,summary,label
0,Two GOP presidential hopefuls - Ted Cruz and B...,Ted Cruz and Ben Carson want the charity to re...,1
1,The Tesla Model S P85D's 'insane mode' may be ...,"latvia-based drive eo has created a vehicle, n...",0
2,MI5 has issued an alert over the threat posed ...,"Alert issued over rogue workers in nuclear , t...",1
3,A new video that shows homeless people reading...,A short film highlights the nasty things peopl...,1
4,Aston Villa may be gearing up for an FA Cup se...,tim sherwood replied to a letter from charlie ...,0


In [19]:
documents_df.head()

Unnamed: 0,document
0,(CNN) -- Shelling hit areas near two key citie...
1,There aren't many NFL players who influenced t...
10,Work has got under way to repaint the striking...
100,(CNN) -- American Presidents have come and gon...
1000,Jessie Roach is 31 years old and has mental di...


> - **Train set.json**: This file contains 8000 summaries in total with (in the field summary of the json file) their original documents (in the field document of the json file) and labels (in the field label of the json file). The dataset is divided as follows: 4000 reference summaries and 4000 summaries generated from different machine summarisation systems.

> - **Test set.json:** This file contains 3600 summaries in total, divided as follows: 1600 reference summaries and 1600 summaries generated using the same models used in the train set. This dataset is distributed equally between the public and private leaderboards on kaggle.

> - **Documents.json:** This file contains 50000 original documents that could be useful to make the predictions.

In [22]:
documents_df["document"][0]

"(CNN) -- Shelling hit areas near two key cities in eastern Ukraine on Sunday morning, intensifying fears that a ceasefire that took effect less than two days ago may be falling apart.\n\nWhy is the ceasefire under strain?\n\nA variety of fighting factions in the conflict zone -- on both sides -- may not fall directly under a military chain of command. The pro-Russian rebels are mostly volunteer militias; fighting against them on the Ukrainian side are at least some far-right nationalist militias. Controlling these groups is difficult and some may have different aims, including sabotaging the truce.\n\nAt this point it's been nearly impossible to figure out who's doing the firing and why.\n\nThe conditions of the ceasefire agreement don't help either. The conditions are vague and at this point there doesn't seem to be an effective mechanism in place inside the conflict zone to monitor and enforce the agreement.\n\nWhy can't the two sides' leaders control their forces?\n\nIt's unclear i

In [23]:
train_df["document"][0]

"Two GOP presidential hopefuls - Ted Cruz and Ben Carson - want the Clinton Foundation to return every dollar its received from foreign governments since it launched more than a decade ago.\n\nThe bum rush on the non-profit came about after a report cast a new shadow over the charity's fundraising practices while Hillary Clinton served as the United State's chief diplomat.\n\nA Reuters investigation that revealed the Bill, Hillary and Chelsea Clinton Foundation had misreported millions of dollars in donations from foreign nations led the global charity to announce that it would refile more than five years of tax documents.\xa0\n\nScroll down for video\xa0\n\nA Reuters investigation that revealed the Bill, Hillary and Chelsea Clinton Foundation had misreported millions of dollars in donations from foreign nations led the global charity to announce that it would refile more than five years of tax documents - Republicans pounced\n\nThe discovery came as separate financial reviews found th

In [24]:
train_df["summary"][0]

"Ted Cruz and Ben Carson want the charity to return every dollar its received from foreign governments since its launch in 2001 . Bum rush came about after a report cast a new shadow over the charity 's fundraising practices while Hillary Clinton was the country 's chief diplomat . Cruz said : ` Having raised tens of millions of dollars from foreign nations presents a clear conflict of interest for anyone running for President ' Carson said they ` should they definitely give back the money and cease accepting foreign donations , but should also make every effort to find missing documents that would shed light if in fact they are innocent ' Carly Fiorina said , ` It 's the Clinton way : raking in millions from foreign governments behind closed doors while making promises about transparency that they never intended to keep '"

------------------------------------------------------------------------------------

### Data Analysis & Feature Generation

In [27]:
import seaborn as sns
import string
import nltk
import warnings 

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# Get the number of dates / entries in each month
train_df.groupby('label')['summary'].count()

label
0    4000
1    4000
Name: summary, dtype: int64

In [29]:
## Character length

train_df['character_cnt'] = train_df['summary'].str.len()
train_df.groupby('label')['character_cnt'].mean()

label
0    304.60550
1    307.99875
Name: character_cnt, dtype: float64

In [31]:
train_df['word_counts'] = train_df['summary'].str.split().str.len()
train_df.groupby('label')['word_counts'].mean()

label
0    54.887
1    56.925
Name: word_counts, dtype: float64

In [32]:
train_df['characters_per_word'] = train_df['character_cnt']/train_df['word_counts']
train_df.groupby('label')['characters_per_word'].mean()

label
0    5.597578
1    5.441352
Name: characters_per_word, dtype: float64

In [33]:
# Special character count
train_df['spl'] = train_df['summary'].apply(lambda x: len([x for x in x.split() if x.startswith('@')]))

In [35]:
#Number of numerics
train_df['num'] = train_df['summary'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train_df.groupby('label')['num'].mean()

label
0    0.67025
1    0.80450
Name: num, dtype: float64

In [37]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

train_df['processedtext'] = train_df['summary'].str.replace('[^\w\s]','') 
train_df['processedtext'] = train_df['processedtext'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train_df['processedtext'] = train_df['processedtext'].apply(lambda x: " ".join(x.lower() for x in x.split()))

#Lines 4 to 6
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
train_df['processedtext'] = train_df['processedtext'].apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))

train_df[['character_cnt','word_counts','characters_per_word', 'spl', 'num', 'processedtext']].head()

Unnamed: 0,character_cnt,word_counts,characters_per_word,spl,num,processedtext
0,833,146,5.705479,0,1,ted cruz ben carson want chariti return everi ...
1,362,65,5.569231,0,3,latviabas drive eo creat vehicl name eo pp03 r...
2,298,54,5.518519,0,1,alert issu rogu worker nuclear transport publi...
3,221,44,5.022727,0,0,a short film highlight nasti thing peopl say h...
4,262,48,5.458333,0,0,tim sherwood repli letter charli pye tuesday p...


In [43]:
def extend_features(train_df, stop = stop, stemmer = stemmer):
    train_df['character_cnt'] = train_df['summary'].str.len()
    train_df['word_counts'] = train_df['summary'].str.split().str.len()
    train_df['characters_per_word'] = train_df['character_cnt']/train_df['word_counts']
    train_df['spl'] = train_df['summary'].apply(lambda x: len([x for x in x.split() if x.startswith('@')]))
    train_df['num'] = train_df['summary'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
    
    train_df['processedtext'] = train_df['summary'].str.replace('[^\w\s]','') 
    train_df['processedtext'] = train_df['processedtext'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
    train_df['processedtext'] = train_df['processedtext'].apply(lambda x: " ".join(x.lower() for x in x.split()))

    train_df['processedtext'] = train_df['processedtext'].apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))
    
    return train_df

In [62]:
## Term Frequency-Inverse Document Frequency (TF-IDF) Vector

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word', stop_words= 'english',ngram_range=(1,1))

dat_tfIdf = tfidf.fit_transform(train_df['processedtext'])
dat_tfIdf

<8000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 114401 stored elements in Compressed Sparse Row format>

In [63]:
training_labels = train_df["label"]
training_df = train_df.drop(["summary", "document", "label", "processedtext"], axis = 1)

In [64]:
td_df = pd.DataFrame(dat_tfIdf.toarray())
training_df = pd.concat([training_df, td_df], axis=1)

In [65]:
training_df

Unnamed: 0,character_cnt,word_counts,characters_per_word,spl,num,0,1,2,3,4,...,990,991,992,993,994,995,996,997,998,999
0,833,146,5.705479,0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,362,65,5.569231,0,3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,298,54,5.518519,0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,221,44,5.022727,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,262,48,5.458333,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,137,24,5.708333,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.177304,0.0,0.0,0.0,0.0
7996,573,105,5.457143,0,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
7997,86,19,4.526316,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
7998,349,70,4.985714,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [66]:
from sklearn.linear_model import LogisticRegression
import csv

## Applying LogisitcRegression

clf = LogisticRegression()
clf.fit(training_df, training_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [68]:
testing_df = extend_features(test_df, stop = stop, stemmer = stemmer)

tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word', stop_words= 'english', ngram_range=(1,1))
test_tfIdf = tfidf.fit_transform(testing_df['processedtext'])

testing_df = testing_df.drop(["summary", "document", "processedtext"], axis = 1)
test_tfIdf

<3200x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 48298 stored elements in Compressed Sparse Row format>

In [69]:
td_df = pd.DataFrame(test_tfIdf.toarray())
testing_df = pd.concat([testing_df, td_df], axis=1)

In [70]:
predictions = clf.predict(testing_df)

In [71]:
# Write predictions to a file
with open("submission2_tfid1000.csv", "w") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id','label'])
    for i, row in enumerate(predictions):
        csv_out.writerow([i, row])

In [72]:
## Bag-of-words Vector
from sklearn.feature_extraction.text import CountVectorizer

bag_words = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
dat_BOW = bag_words.fit_transform(train_df['processedtext'])
dat_BOW

<8000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 133155 stored elements in Compressed Sparse Row format>

In [73]:
td_df = pd.DataFrame(dat_BOW.toarray())
training_df = pd.concat([training_df, td_df], axis=1)

In [74]:
## Applying LogisitcRegression
clf = LogisticRegression()
clf.fit(training_df, training_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [75]:
bag_words = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
dat_BOW = bag_words.fit_transform(test_df['processedtext'])

In [76]:
td_df = pd.DataFrame(dat_BOW.toarray())
testing_df = pd.concat([testing_df, td_df], axis=1)

In [78]:
predictions = clf.predict(testing_df)

# Write predictions to a file
with open("submission2_BOW.csv", "w") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id','label'])
    for i, row in enumerate(predictions):
        csv_out.writerow([i, row])