In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


# Load the training and test splits from their JSON dumps.
training_set = pd.read_json(path_or_buf='./data/train_set.json')
test_set = pd.read_json(path_or_buf='./data/test_set.json')

# Text featurisation for a downstream classifier (no model is fitted in this cell).

# 1. TF-IDF features: the vectorizer is fitted on the training texts only, with
#    max_features capped at 10,000.
vectorizer = TfidfVectorizer(max_features=10_000)
X = vectorizer.fit_transform(raw_documents=training_set['text'])

In [27]:
# Sanity check: dimensions and column names of the training frame.
(training_set.shape, training_set.columns)

((4000, 3), Index(['id', 'text', 'label'], dtype='object'))

In [32]:
# Surface features of each training text:
#   length  - number of characters
#   words   - number of pieces obtained by splitting on single spaces
#             (one more than the number of " " characters)
#   capital - 1 when the first character equals its own capitalized form
#             (so digits and punctuation also count as 1), otherwise 0
# NOTE: "capital" indexes the first character, so an empty text raises IndexError.
training_set["length"] = training_set["text"].map(len)
training_set["words"] = training_set["text"].map(lambda text: text.count(" ") + 1)
training_set["capital"] = training_set["text"].map(
    lambda text: int(text[0].capitalize() == text[0])
)

In [33]:
# Peek at the first five rows, including the newly added feature columns.
training_set.head()

Unnamed: 0,id,text,label,length,words,capital
0,0,Remains dating to the 5th century were found i...,1,426,82,1
1,1,Professional Identification is a type of socia...,1,1429,192,1
2,2,A magistrate has refused a media ban on the tr...,1,360,67,1
3,3,The Conservation Commons is the expression of ...,1,1403,201,1
4,4,Myer is holding a 75 per cent of designer labe...,0,325,57,1


In [34]:
# Rows whose label is 1, followed by summary statistics of their numeric columns.
positive = training_set.loc[training_set["label"].eq(1)]
positive.describe()

Unnamed: 0,id,label,length,words,capital
count,2016.0,2016.0,2016.0,2016.0,2016.0
mean,2002.933532,1.0,567.970734,97.464782,0.995536
std,1160.984084,0.0,399.284814,63.877222,0.066683
min,0.0,1.0,64.0,14.0,0.0
25%,980.25,1.0,268.0,48.0,1.0
50%,2003.5,1.0,355.0,66.0,1.0
75%,3023.5,1.0,957.25,158.0,1.0
max,3999.0,1.0,3524.0,678.0,1.0


In [35]:
# Rows whose label is 0, followed by summary statistics of their numeric columns.
negative = training_set.loc[training_set["label"].eq(0)]
negative.describe()

Unnamed: 0,id,label,length,words,capital
count,1984.0,1984.0,1984.0,1984.0,1984.0
mean,1996.011089,0.0,598.970766,102.265121,0.835685
std,1148.855191,0.0,438.125692,71.131035,0.370654
min,4.0,0.0,78.0,14.0,0.0
25%,1024.5,0.0,276.0,49.0,1.0
50%,1994.5,0.0,362.5,70.0,1.0
75%,2973.5,0.0,1011.0,170.0,1.0
max,3998.0,0.0,1731.0,273.0,1.0


In [38]:
# Label-0 rows whose "capital" flag is 0 (first character differs from its
# capitalized form), followed by their summary statistics.
negative_minuscule = training_set.loc[
    training_set["label"].eq(0) & training_set["capital"].eq(0)
]
negative_minuscule.describe()

Unnamed: 0,id,label,length,words,capital
count,326.0,326.0,326.0,326.0,326.0
mean,1987.509202,0.0,298.064417,52.06135,0.0
std,1139.654306,0.0,149.262841,24.869182,0.0
min,5.0,0.0,78.0,14.0,0.0
25%,986.75,0.0,212.25,37.0,0.0
50%,2010.5,0.0,278.0,49.0,0.0
75%,2972.25,0.0,353.75,62.0,0.0
max,3992.0,0.0,1517.0,248.0,0.0


In [43]:
import csv


def load_submission_labels(csv_path):
    """Return the integer labels stored in the second column of a submission CSV.

    The first row is treated as a header and skipped. Each remaining non-empty
    row contributes ``int(row[1])``. The field is parsed with ``int`` instead
    of ``eval`` so that the contents of the file are never executed as Python
    code.

    Parameters
    ----------
    csv_path : str
        Path of the submission file to read.

    Returns
    -------
    list of int
        One label per data row, in file order.

    Raises
    ------
    ValueError
        If a label field is not an integer literal.
    IndexError
        If a non-empty data row has fewer than two fields.
    """
    # newline="" is what the csv module documentation asks for when opening
    # files that are handed to csv.reader.
    with open(csv_path, "r", newline="") as pred:
        csv_in = csv.reader(pred)
        next(csv_in, None)  # skip the header row; tolerate an empty file
        # csv.reader yields [] for blank lines (e.g. a trailing newline); skip them.
        return [int(row[1]) for row in csv_in if row]


test_label_gpt = load_submission_labels("submission_gpt_66.csv")
test_label_xgboost = load_submission_labels("submission_xgboost_83.csv")

In [81]:
# "capital" is 1 when the first character of the text equals its own
# capitalized form, otherwise 0.
# NOTE: indexes the first character, so an empty text raises IndexError.
test_set["capital"] = test_set["text"].map(
    lambda text: int(text[0].capitalize() == text[0])
)
# Attach the predictions read from the two submission files; each list must
# have exactly one entry per test row for the assignment to succeed.
test_set["gpt"] = test_label_gpt
test_set["xgboost"] = test_label_xgboost


In [59]:
# Test rows whose "capital" flag is 0, followed by their summary statistics.
test_minuscule = test_set.loc[test_set["capital"].eq(0)]
test_minuscule.describe()

Unnamed: 0,id,capital,gpt,xgboost,label
count,334.0,334.0,334.0,334.0,334.0
mean,2025.035928,0.0,0.976048,0.020958,0.0
std,1129.930447,0.0,0.153129,0.143459,0.0
min,15.0,0.0,0.0,0.0,0.0
25%,1008.0,0.0,1.0,0.0,0.0
50%,2058.0,0.0,1.0,0.0,0.0
75%,2993.25,0.0,1.0,0.0,0.0
max,3984.0,0.0,1.0,1.0,0.0


In [62]:
# Reload both splits from disk; this rebinds training_set / test_set, so any
# columns added to the previous frames are no longer present.
training_set = pd.read_json(path_or_buf='./data/train_set.json')
test_set = pd.read_json(path_or_buf='./data/test_set.json')


# Heuristic label for the test set (NOT ground truth): 1 when the first
# character equals its own capitalized form, otherwise 0.
# NOTE: indexes the first character, so an empty text raises IndexError.
test_set["label"] = test_set["text"].map(
    lambda text: int(text[0].capitalize() == text[0])
)
# Keep only the test rows that received the heuristic label 0.
test_minuscule = test_set.loc[test_set["label"].eq(0)]

In [69]:
# Stack the rows of test_minuscule (selected in an earlier cell) underneath the
# training data, shuffle the combined rows, and save the result.
new_train = pd.concat((training_set, test_minuscule), axis=0)
# A fixed random_state makes the shuffle, and therefore the saved file,
# reproducible across kernel restarts; without it every run wrote a
# differently ordered file.
new_train = new_train.sample(frac=1, random_state=42).reset_index(drop=True)
new_train.to_json("data/train_set_aug.json")

In [70]:
# Number of rows in the augmented training frame.
new_train.shape[0]

4334

In [77]:
# From here on training_set refers to the augmented file written to disk.
training_set = pd.read_json(path_or_buf='./data/train_set_aug.json')
# Show the text of the first row as a spot check.
training_set["text"].iloc[0]

"The video shows two men and one woman entering the Los Angeles home . They 're seen tip-toeing through the house before entering another room . But then one suspect returns and looks straight at the camera before he ducks and knocks it down . The LAPD released the film in hopes it will help catch the burglars , described as being age 17 to 20 ."

In [79]:
# Read the labels (second column) from the submission file, skipping the
# header row. Fields are parsed with int() rather than eval() so the contents
# of the file are never executed as Python code; a non-integer field raises
# ValueError.
# newline="" is what the csv module documentation asks for with csv.reader.
with open("submission_bert_aug.csv", "r", newline="") as pred:
    csv_in = csv.reader(pred)
    next(csv_in, None)  # skip the header row; tolerate an empty file
    # csv.reader yields [] for blank lines (e.g. a trailing newline); skip them.
    test_label_aug = [int(row[1]) for row in csv_in if row]

In [83]:
# "capital" is 1 when the first character of the text equals its own
# capitalized form, otherwise 0.
# NOTE: indexes the first character, so an empty text raises IndexError.
test_set["capital"] = test_set["text"].map(
    lambda text: int(text[0].capitalize() == text[0])
)
# Attach the predictions read from submission_bert_aug.csv; the list must have
# exactly one entry per test row for the assignment to succeed.
test_set["aug"] = test_label_aug


Unnamed: 0,id,text,label,capital,gpt,xgboost,aug
0,0,David Cameron revealed he and his wife Samanth...,1,1,1,0,0
1,1,David Cameron appeared to forget which footbal...,1,1,1,0,0
2,2,Ebola tests for a senior doctor has come back ...,1,1,1,1,1
3,3,Celebrity chef's Fat Duck named eighth best re...,1,1,1,1,1
4,4,"Navinder Singh Sarao , 36 , is accused of maki...",1,1,1,0,0
...,...,...,...,...,...,...,...
3995,3995,"Bob Katter , federal MP for Kennedy , is of Le...",1,1,1,0,0
3996,3996,Man came to hospital complaining his thumb hur...,1,1,1,1,1
3997,3997,London weighting is an allowance paid to certa...,1,1,1,1,0
3998,3998,"The Toronto Signals Band (abbreviated to ""Sigs...",1,1,0,0,0


In [88]:
import numpy as np
# Number of test rows on which "aug" agrees with "xgboost", and on which
# "gpt" agrees with "xgboost".
(
    test_set["aug"].eq(test_set["xgboost"]).sum(),
    test_set["gpt"].eq(test_set["xgboost"]).sum(),
)

(3710, 2608)

In [95]:
# Test rows whose "label" column is 0.
test_minuscule = test_set.loc[test_set["label"].eq(0)]


Unnamed: 0,id,text,label,capital,gpt,xgboost,aug
15,15,"nathan brown, 19, was working with his father ...",0,0,1,0,0
60,60,mass murderer peter sutcliffe is to be moved t...,0,0,1,0,0
116,116,casey levi filmed the moment he tried to get h...,0,0,1,0,0
122,122,wayne kyle gave his first interview since the ...,0,0,1,0,0
140,140,a newly-released video has highlighted the sho...,0,0,1,0,0


In [96]:
# Summary statistics of the numeric columns of test_minuscule.
test_minuscule.describe()

Unnamed: 0,id,label,capital,gpt,xgboost,aug
count,334.0,334.0,334.0,334.0,334.0,334.0
mean,2025.035928,0.0,0.0,0.976048,0.020958,0.011976
std,1129.930447,0.0,0.0,0.153129,0.143459,0.108941
min,15.0,0.0,0.0,0.0,0.0,0.0
25%,1008.0,0.0,0.0,1.0,0.0,0.0
50%,2058.0,0.0,0.0,1.0,0.0,0.0
75%,2993.25,0.0,0.0,1.0,0.0,0.0
max,3984.0,0.0,0.0,1.0,1.0,1.0
