In [1]:
import numpy as np 
import pandas as pd

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [4]:
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)
import string
punct=string.punctuation
from spacy.lang.en import English

In [5]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [6]:
def tokenizerX(x):
    doc = nlp(x)
    
    tokens = [token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_ for token in doc]     
    final_tokens = [token for token in tokens if token not in stopwords and token not in punct]
    return final_tokens

- If the hyperplane classifies the dataset linearly then the algorithm we call it as SVC and the algorithm that separates the dataset by non-linear approach then we call it as SVM.

In [7]:
from sklearn.naive_bayes import CategoricalNB

In [8]:
tfidf = TfidfVectorizer(tokenizer = tokenizerX)
model = LinearSVC()
#classifier = CategoricalNB()

In [9]:
x = train['text']
y = train['target']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [11]:
model_pipeline = Pipeline([('tfidf', tfidf), ('model', model)])

In [12]:
model_pipeline.fit(X_train,y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function tokenizerX at 0x7f983c9d9d30>)),
                ('model', LinearSVC())])

In [13]:
y_pred = model_pipeline.predict(X_test)

In [14]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.82      0.81       874
           1       0.75      0.72      0.74       649

    accuracy                           0.78      1523
   macro avg       0.77      0.77      0.77      1523
weighted avg       0.78      0.78      0.78      1523



In [215]:
#y_pred=model.predict(test['text'])

## Identify the label of each tweet

In [45]:
X_test

2644    So you have a new weapon that can cause un-ima...
2227    The f$&amp;@ing things I do for #GISHWHES Just...
5448    DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...
132     Aftershock back to school kick off was great. ...
6845    in response to trauma Children of Addicts deve...
                              ...                        
1835                @SmusX16475 Skype just crashed u host
506     Christian Attacked by Muslims at the Temple Mo...
3592    Man charged over fatal crash near Dubbo refuse...
6740    #usNWSgov Severe Weather Statement issued Augu...
1634    Great British &lt;b&gt;Bake&lt;/b&gt; Off's ba...
Name: text, Length: 1523, dtype: object

In [128]:
len(y_test)

1523

In [134]:
y_test[3592]

1

In [132]:
X_test[3592]

'Man charged over fatal crash near Dubbo refused bail http://t.co/HDBMfOVUtZ via @dailyliberal'

In [170]:
x_test = pd.DataFrame(X_test)
y_test1 = pd.DataFrame(y_test)

In [212]:
x_test.reset_index()
x_test.head(10)

Unnamed: 0,text
2644,So you have a new weapon that can cause un-ima...
2227,The f$&amp;@ing things I do for #GISHWHES Just...
5448,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...
132,Aftershock back to school kick off was great. ...
6845,in response to trauma Children of Addicts deve...
5559,@Calum5SOS you look like you got caught in a r...
1765,my favorite lady came to our volunteer meeting...
1817,@brianroemmele UX fail of EMV - people want to...
6810,Can't find my ariana grande shirt this is a f...
4398,The Murderous Story Of AmericaÛªs First Hijac...


In [213]:
y_test1.reset_index()
y_test1.head(10)

Unnamed: 0,target
2644,1
2227,0
5448,1
132,0
6845,0
5559,0
1765,1
1817,1
6810,0
4398,1


In [51]:
len(X_test)

1523

In [102]:
range(idx)

range(0, 1523)

In [203]:
def get_review_and_class(i):
    print(x_test.iloc[i]['text'])
    if y_test1.iloc[i]['target'] == 1:
      print('This is a disaster tweet!')
    elif y_test1.iloc[i]['target'] == 0:
      print('This is not a disaster tweet!')

In [214]:
get_review_and_class(4)

in response to trauma Children of Addicts develop a defensive self - one that decreases vulnerability. (3
This is not a disaster tweet!


## Predict labels for new input tweet

In [208]:
new_input = ['The sky is stormy']

In [209]:
y_pred = model_pipeline.predict(new_input)
y_pred

array([1])