In [112]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

#### Data importing and exploration

In [113]:
## importing data
df = pd.read_csv(r'Data\train.csv')

In [114]:
df.sample(frac=1)

Unnamed: 0,id,keyword,location,text,target
4318,6132,hellfire,,The Prophet (peace be upon him) said 'Save you...,1
353,506,army,cyprus,Build your own kingdom and lead your army to v...,0
6291,8987,storm,Santiago de Chile,Doves - The Storm + Greatest Denier (Electric ...,1
1101,1593,bombed,,The majority of those killed were civilians on...,1
5362,7651,panic,,Love it when Jesse suffers a panic attack. htt...,0
...,...,...,...,...,...
6895,9884,traumatised,North East / Middlesbrough,Sending a snapchat to the wrong person instead...,0
3835,5458,first%20responders,"Sacramento, CA",As firefighters make gains on #RockyFire Jerry...,1
1328,1919,burning,New York,2 Burning Man Tickets + Vehicle Pass - Full re...,0
3262,4687,engulfed,london,@suelinflower there is no words to describe th...,0


In [115]:
df.sample(frac=1)

Unnamed: 0,id,keyword,location,text,target
256,364,annihilation,,@CalFreedomMom @steph93065 not to mention a ma...,1
4407,6265,hijacking,tokyo,#hot Funtenna: hijacking computers to send da...,0
5078,7241,natural%20disaster,America of Founding Fathers,This is the natural and unavoidable consequenc...,0
5113,7292,nuclear%20disaster,Fukushima city Fukushima.pref,Over half of poll respondents worry nuclear di...,0
157,225,airplane%20accident,"Lehigh Valley, PA",Strict liability in the context of an airplane...,1
...,...,...,...,...,...
1898,2729,crushed,"Ontario, Canada.",Jesus Christ that ball was fucking crushed!! #...,0
4388,6234,hijacker,,Governor allows parole for California school b...,1
5040,7186,mudslide,,HE CALLED IT A MUDSLIDE AW,0
460,664,attack,,Delhi Government to Provide Free Treatment to ...,1


In [116]:
# Dropping ID and location columns
df.drop(columns=['id', 'location'], inplace=True)

In [117]:
# Removing duplicates
print(df.duplicated().sum())
df.drop_duplicates(inplace=True)

72


In [118]:
df.sample(frac=1)

Unnamed: 0,keyword,text,target
4294,hellfire,Hellfire is surrounded by desires so be carefu...,0
783,blight,@kynespeace *blight,0
4130,hailstorm,@CouncilSCC it does say hailstorm,1
6115,sinking,Sinking carb consultative assembly plans could...,0
6915,trouble,@canagal Good to hear it's back.. that storm's...,1
...,...,...,...
2672,detonate,IÛªve just signed up for the Detonate Notting...,0
2250,deluged,Why are you deluged with low self-image? Take ...,1
5088,nuclear%20disaster,http://t.co/GaM7otGISw\nANOTHER DISASTER WAITI...,1
2471,derailment,Train derailment: In Patna no news of any casu...,1


In [119]:
# Dropping null values
print(df.isna().sum())
df.dropna(inplace=True)

keyword    61
text        0
target      0
dtype: int64


In [120]:
df.sample(frac=1)

Unnamed: 0,keyword,text,target
5009,military,the MOFO in DC will leave our military unarmed...,0
4506,hurricane,Hurricane Dancers: The First Caribbean Pirate ...,0
1816,crashed,Bin Laden family plane crashed after 'avoiding...,1
7442,wounds,RT @DianneG: Gunshot wound #9 is in the bicep....,0
5435,police,Maid charged with stealing Dh30000 from police...,0
...,...,...,...
1848,crush,This guy idk just made me his woman crush ?? f...,0
5002,military,@UniversityofLaw For the people who died in Hu...,1
1318,burning,@nagel_ashley @Vicken52 @BasedLaRock @goonc1ty...,0
1183,bridge%20collapse,@followlasg This is urgentthere is currently a...,1


#### Data and text preprocessing

In [121]:
# Label encoding keyword column
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['keyword'] = le.fit_transform(df['keyword'])
df.sample(frac=1)

Unnamed: 0,keyword,text,target
640,18,@Kaotix_Blaze craving u,0
2128,61,ÛÏ@LOLGOP: 2.2 cases of voter fraud a year. \...,1
7041,205,Obama Declares Disaster for Typhoon-Devastated...,1
6808,198,Rly tragedy in MP: Some live to recount horror...,1
6357,184,Virgin galactic crash: early unlocking of brak...,1
...,...,...,...
3928,112,iembot_hfo : At 10:00 AM 2 NNW Hana [Maui Co H...,1
6276,182,New item: Pillow Covers ANY SIZE Pillow Cover ...,0
5010,145,Stu Dorret's mudslide rubber tyre cake may hav...,0
2003,57,@HfxStanfield @beelieveDC @DiscoveryCntr what ...,1


In [122]:
df

Unnamed: 0,keyword,text,target
31,0,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,0,We always try to bring the heavy. #metal #RT h...,0
33,0,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,0,Crying out for more! Set me ablaze,0
35,0,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...
7578,220,@jt_ruff23 @cameronhacker and I wrecked you both,0
7579,220,Three days off from work and they've pretty mu...,0
7580,220,#FX #forex #trading Cramer: Iger's 3 words tha...,0
7581,220,@engineshed Great atmosphere at the British Li...,0


In [123]:
# Reshuffling the dataset
df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,keyword,text,target
0,202,why is it trouble@niallhariss / @simply_vain l...,0
1,140,Kach was a group to which belonged Baruch Gold...,1
2,177,Do you feel like you are sinking in unhappines...,0
3,173,'[+54 -9] How do people not know who Kendall J...,0
4,96,Evacuation order lifted for town of Roosevelt ...,1
...,...,...,...
7475,75,Megadeth Week - Symphony Of Destruction http:/...,0
7476,220,http://t.co/DeQQOpSP4f: Iger's 3 words that wr...,0
7477,99,Worked at a fast food joint. Poured burnt hot ...,0
7478,81,@peterjukes A crime that killed/displaced mill...,1


In [124]:
# Preprocessing text
import re
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

def text_preprocess(text):
    # Fixing contractions
    text = contractions.fix(text)
    
    # Removing non-alphabetical characters
    text = re.sub('[^a-zA-Z\s]', '', text)
    
    # Converting to lowercase
    text = text.lower()
    
    # Removing stopwords
    s_words = stopwords.words('english')
    s_words.remove('not')
    text = text.split()
    ps = PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text if word not in s_words])
    
    
    return text

In [125]:
df['text'] = df['text'].apply(text_preprocess)

In [126]:
df_raw = df.copy()

In [96]:
df.sample(frac=1)

Unnamed: 0,keyword,text,target
2857,155,sav contact fell convinc not amp went guag eye...,0
6694,63,wrinkl face delug decay,0
6931,28,anoth one anoth one still not done shit one ni...,1
3391,64,thesewphist whoever hold address fuckfacewinei...,0
6562,192,pakistan suprem court rule allow militari tria...,1
...,...,...,...
1680,155,time panic httpstcoorxdqfzj,0
1757,40,news wrap un warn femal child casualti rise af...,1
5565,145,oso washington mudslid respons interview part ...,1
1713,32,japan mark th anniversari hiroshima atom bomb ...,1


#### Word2vec

In [97]:
# Downloading pre-trained model
from gensim.models import KeyedVectors
model_path = r"E:\10k\NLP\GoogleNews-vectors-negative300.bin.gz"
w2v_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [180]:
# Returning the mean Numpy arrays present in Word2vec vocab of our corpus
def vectorize(text, word2vec_model):
    text = [word for word in text if word in word2vec_model.key_to_index]
    return np.mean(word2vec_model[text], axis=0)

In [181]:
# Converting text column into np arrays
for i in range(len(df['text'].values)):
    df['text'][i] = vectorize(df['text'][i], w2v_model)

In [182]:
df

Unnamed: 0,keyword,text,target
0,202,"[-0.17875744, 0.11705526, -0.00011044457, 0.12...",0
1,140,"[-0.18359995, 0.10834183, 0.0064431895, 0.1303...",1
2,177,"[-0.18566668, 0.103693075, -0.008680556, 0.157...",0
3,173,"[-0.14753418, 0.10582764, -0.023497315, 0.1483...",0
4,96,"[-0.18159993, 0.11469297, -0.029969392, 0.1329...",1
...,...,...,...
7475,75,"[-0.19370623, 0.12955977, -0.03229355, 0.15909...",0
7476,220,"[-0.18161882, 0.11741594, -0.02065604, 0.16373...",0
7477,99,"[-0.19840786, 0.1155665, -0.027167778, 0.13892...",0
7478,81,"[-0.18706846, 0.116167955, -0.008435747, 0.144...",1


In [183]:
df['text']

0       [-0.17875744, 0.11705526, -0.00011044457, 0.12...
1       [-0.18359995, 0.10834183, 0.0064431895, 0.1303...
2       [-0.18566668, 0.103693075, -0.008680556, 0.157...
3       [-0.14753418, 0.10582764, -0.023497315, 0.1483...
4       [-0.18159993, 0.11469297, -0.029969392, 0.1329...
                              ...                        
7475    [-0.19370623, 0.12955977, -0.03229355, 0.15909...
7476    [-0.18161882, 0.11741594, -0.02065604, 0.16373...
7477    [-0.19840786, 0.1155665, -0.027167778, 0.13892...
7478    [-0.18706846, 0.116167955, -0.008435747, 0.144...
7479    [-0.18513489, 0.13775864, -0.030007934, 0.1407...
Name: text, Length: 7480, dtype: object

#### ML Model

In [184]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [185]:
X = np.hstack((df['keyword'].values.reshape(-1,1), np.vstack(df['text'].values))) # Horizonatally stacking the int column as a 1d array along with the vertical stack of the text multi-dimensional array 
y = df['target'].values

In [186]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

clf = LogisticRegression()
clf.fit(X_train, y_train) # Training the model
y_pred = clf.predict(X_test) # Getting predictions
acc = accuracy_score(y_test, y_pred)
print('Acc:', acc)

Acc: 0.6116310160427807
