In [73]:
import pandas as pd
import numpy as np

In [74]:
df_train = pd.read_csv('train.csv')

In [75]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [76]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [77]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
" u ": " you ",
" ur ": " your ",
" n ": " and "}

In [78]:
import re
def get_clean_text(x):
    if type(x) is str:
        x = x.lower()
        for key in contractions:
            value = contractions[key]
            x = x.replace(key, value)
            #regex to remove emails and replace with space
            x = re.sub(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', '', x) 
              #regex to remove URLs
            x = re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', x)
            #removing html tags
            #removing punctuation and numbers
            x = re.sub(r'[^a-zA-Z]', ' ', x)
            
            clean = re.compile('<.*?>')
            x= re.sub(clean, '', x)
           
            return x
    else:
        return x

In [79]:
df_train['text'] = df_train['text'].apply(lambda x: get_clean_text(x)) 

In [80]:
df_train['text']

0       our deeds are the reason of this  earthquake m...
1                  forest fire near la ronge sask  canada
2       all residents asked to  shelter in place  are ...
3              people receive  wildfires evacuation or...
4       just got sent this photo from ruby  alaska as ...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609     aria ahrary  thetawniest the out of control w...
7610          m            utc   km s of volcano hawaii  
7611    police investigating after an e bike collided ...
7612    the latest  more homes razed by northern calif...
Name: text, Length: 7613, dtype: object

In [81]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop = stopwords.words('english')
pat = r'\b(?:{})\b'.format('|'.join(stop))
df_train['text'] = df_train['text'].str.replace(pat, '')
df_train['text'] = df_train['text'].str.replace(r'\s+', ' ')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DeLL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df_train['text'] = df_train['text'].str.replace(pat, '')
  df_train['text'] = df_train['text'].str.replace(r'\s+', ' ')


In [82]:
df_train['text'] = df_train['text'].str.replace('[^\w\s]','') #to remove punctuation

  df_train['text'] = df_train['text'].str.replace('[^\w\s]','') #to remove punctuation


In [83]:
vectorizer = CountVectorizer()
x = df_train['text']
y= df_train['target']

x = vectorizer.fit_transform(x)
x

<7613x16052 sparse matrix of type '<class 'numpy.int64'>'
	with 67053 stored elements in Compressed Sparse Row format>

In [84]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [85]:
clf = LinearSVC()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

In [86]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))


[[762 124]
 [191 446]]


In [87]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.86      0.83       886
           1       0.78      0.70      0.74       637

    accuracy                           0.79      1523
   macro avg       0.79      0.78      0.78      1523
weighted avg       0.79      0.79      0.79      1523



In [88]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [89]:
knn = KNeighborsClassifier(n_neighbors=101)
knn.fit(x_train, y_train)
y_predict = knn.predict(x_test)

In [90]:
print('Accuracy: ', metrics.accuracy_score(y_test, y_predict))

Accuracy:  0.5817465528562049


In [91]:
print(confusion_matrix(y_test, y_predict))

[[886   0]
 [637   0]]


In [92]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.58      1.00      0.74       886
           1       0.00      0.00      0.00       637

    accuracy                           0.58      1523
   macro avg       0.29      0.50      0.37      1523
weighted avg       0.34      0.58      0.43      1523



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
