# Importing Libraries and csv Files

In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/elinakelly/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/elinakelly/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/elinakelly/nltk_data...


True

In [11]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [12]:
import re
# https://www.scaler.com/topics/remove-special-characters-from-string-python/
# re : Regular expressions form a pattern we can use to search for special characters in a string. 
# This property is used in functions to remove special characters from strings in python

In [13]:
#train = pd.read_csv('/Users/elinakelly/Desktop/JuanJo Kaggle/Kaggle Data 1/train.csv')
#test = pd.read_csv('/Users/elinakelly/Desktop/JuanJo Kaggle/Kaggle Data 1/test.csv')

In [14]:
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')

In [15]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [16]:
train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [17]:
train.dtypes

id           int64
keyword     object
location    object
text        object
target       int64
dtype: object

In [18]:
# COUNT NUMBER OF REAL DISASTER TWEETS
train["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

# I/ Pre-Processing

-> Remove site links

In [19]:
train['text'] = train['text'].str.replace('http[^\s]*',"")
test['text'] = test['text'].str.replace('http[^\s]*',"")

-> Remove special characters

In [20]:
train['txt'] = train['text'].apply(lambda x : re.sub("[^A-Z]", " ", str(x),0,re.IGNORECASE))
test['txt'] = test['text'].apply(lambda x : re.sub("[^A-Z]", " ", str(x),0,re.IGNORECASE))

-> Convert to string

In [21]:
train['txt'] = train['txt'].astype('string')
test['txt'] = test['txt'].astype('string')

In [22]:
train['txt']

0       Our Deeds are the Reason of this  earthquake M...
1                  Forest fire near La Ronge Sask  Canada
2       All residents asked to  shelter in place  are ...
3              people receive  wildfires evacuation or...
4       Just got sent this photo from Ruby  Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609     aria ahrary  TheTawniest The out of control w...
7610          M            UTC   km S of Volcano Hawaii  
7611    Police investigating after an e bike collided ...
7612    The Latest  More Homes Razed by Northern Calif...
Name: txt, Length: 7613, dtype: string

-> Remove stopwords

In [23]:
stop_words = stopwords.words('english')
train['txt'] = train['txt'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
test['txt'] = test['txt'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

-> Convert to Lower Case

In [24]:
train['txt'] = train['txt'].apply(lambda x: x.lower())
test['txt'] = test['txt'].apply(lambda x: x.lower())

-> Remove Strings with 1 letter

In [25]:
train['txt'] =  train['txt'].apply(lambda x: ' '.join( [w for w in x.split() if len(w)>1] ))
test['txt'] =  test['txt'].apply(lambda x: ' '.join( [w for w in x.split() if len(w)>1] ))

In [26]:
train['txt']

0        our deeds reason earthquake may allah forgive us
1                   forest fire near la ronge sask canada
2       all residents asked shelter place notified off...
3       people receive wildfires evacuation orders cal...
4       just got sent photo ruby alaska smoke wildfire...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    aria ahrary thetawniest the control wild fires...
7610                                utc km volcano hawaii
7611    police investigating bike collided car little ...
7612    the latest more homes razed northern californi...
Name: txt, Length: 7613, dtype: object

-> Tokenisation & Lemmatisation

In [27]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

train['txt'] = train['txt'].apply(lemmatize_text)
test['txt'] = test['txt'].apply(lemmatize_text)

In [28]:
train['txt'] = train['txt'].apply(lambda x: ' '.join( [w for w in x]))
test['txt'] = test['txt'].apply(lambda x: ' '.join( [w for w in x]))

In [29]:
train['txt']

0          our deed reason earthquake may allah forgive u
1                   forest fire near la ronge sask canada
2       all resident asked shelter place notified offi...
3       people receive wildfire evacuation order calif...
4       just got sent photo ruby alaska smoke wildfire...
                              ...                        
7608    two giant crane holding bridge collapse nearby...
7609    aria ahrary thetawniest the control wild fire ...
7610                                utc km volcano hawaii
7611    police investigating bike collided car little ...
7612    the latest more home razed northern california...
Name: txt, Length: 7613, dtype: object

# II/ Applying Algorithms

## 1. Ridge Classifier (-> NO)

In [180]:
count_vectorizer = feature_extraction.text.CountVectorizer()
## let's get counts for the first 5 tweets in the data
train_vectors = count_vectorizer.fit_transform(train['txt'])

In [181]:
test_vectors = count_vectorizer.transform(test["txt"])

In [182]:
## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression 
## is a good way to do this.
clf = linear_model.RidgeClassifier()

In [183]:
scores = model_selection.cross_val_score(clf, train_vectors, train["target"], cv=3, scoring="f1")
scores

array([0.6104245 , 0.53306813, 0.60749507])

In [184]:
clf.fit(train_vectors, train["target"])

RidgeClassifier()

In [185]:
test["pred"] = clf.predict(test_vectors)

In [186]:
display(test[["txt","pred"]])
test.style.set_properties(subset=['text'], **{'width': '3000px'})

Unnamed: 0,txt,pred
0,just happened terrible car crash,0
1,heard earthquake different city stay safe ever...,1
2,forest fire spot pond goose fleeing across str...,1
3,apocalypse lighting spokane wildfire,1
4,typhoon soudelor kill china taiwan,1
...,...,...
3258,earthquake safety los angeles safety fastener ...,1
3259,storm ri worse last hurricane my city amp othe...,1
3260,green line derailment chicago,1
3261,meg issue hazardous weather outlook hwo,1


Unnamed: 0,id,keyword,location,text,txt,pred
0,0,,,Just happened a terrible car crash,just happened terrible car crash,0
1,2,,,"Heard about #earthquake is different cities, stay safe everyone.",heard earthquake different city stay safe everyone,1
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",forest fire spot pond goose fleeing across street cannot save,1
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill china taiwan,1
5,12,,,We're shaking...It's an earthquake,we shaking it earthquake,1
6,21,,,"They'd probably still show more life than Arsenal did yesterday, eh? EH?",they probably still show life arsenal yesterday eh eh,0
7,22,,,Hey! How are you?,hey how,0
8,27,,,What a nice hat?,what nice hat,0
9,29,,,Fuck off!,fuck,0


## 2. Latent Semantic Analysis

In [1]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
import scipy.cluster.hierarchy as shc

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image

%matplotlib inline
sns.set_style("darkgrid")

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.linear_model import LogisticRegression
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from matplotlib import cm
import numpy as np
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import silhouette_score
from sklearn.decomposition import TruncatedSVD, randomized_svd
import string

import warnings
warnings.filterwarnings('ignore')

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re
tokenizer = RegexpTokenizer(r'\b\w{3,}\b')

In [4]:
sample_tfidf = TfidfVectorizer(lowercase=True, 
                      #  stop_words=stop_words, 
                        tokenizer=tokenizer.tokenize,
#                         max_df = 0.8,
                        min_df = 0.1
                       )

In [5]:
sample_tfidf

TfidfVectorizer(min_df=0.1,
                tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='\\b\\w{3,}\\b', gaps=False, discard_empty=True, flags=re.UNICODE|re.MULTILINE|re.DOTALL)>)

In [32]:
sample_sparse = sample_tfidf.fit_transform(train[:20])

In [34]:
sample_exp = pd.DataFrame(sample_sparse.toarray(), 
                        columns=sample_tfidf.get_feature_names())

sample_exp.head()

Unnamed: 0,keyword,location,target,text,txt
0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0
