In [1]:
import numpy as np
import pandas as pd
import nltk
train_data=pd.read_csv("train.csv")
test_data=pd.read_csv("test.csv")
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
train_data.shape,test_data.shape

((7613, 5), (3263, 4))

In [3]:
train_text=train_data.text
test_text=test_data.text

In [4]:
y=train_data.target

In [5]:
import re

In [7]:
def clean_text(text):
    text=text.lower()
    text=re.sub('#','',text)
    text=re.sub('[^a-zA-Z ]','',text)
    return text

In [8]:
train_text.head()

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

In [9]:
train_text=train_text.apply(clean_text)
test_text=test_text.apply(clean_text)

In [10]:
train_text.head()

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3     people receive wildfires evacuation orders in...
4    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

In [11]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemmatizer=WordNetLemmatizer()

In [12]:
train_sequence=[]
for i in range(len(train_text)):
    words=nltk.word_tokenize(train_text.iloc[i])
    words=[lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    sent=' '.join(words)
    train_sequence.append(sent)

In [13]:
len(train_sequence)

7613

In [14]:
train_sequence[0]

'deed reason earthquake may allah forgive u'

In [15]:
test_sequence=[]
for i in range(len(test_text)):
    words=nltk.word_tokenize(test_text.iloc[i])
    words=[lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    sent=' '.join(words)
    test_sequence.append(sent)

In [16]:
len(test_sequence)

3263

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
tfidf=TfidfVectorizer(min_df=2,ngram_range=(1,3),max_features=10000)

In [19]:
vectorized_train=tfidf.fit_transform(train_sequence)

In [20]:
vectorized_train.shape

(7613, 10000)

In [21]:
vectorized_test=tfidf.transform(test_sequence)

In [22]:
vectorized_test.shape

(3263, 10000)

In [23]:
vectorized_train=vectorized_train.toarray()
vectorized_test=vectorized_test.toarray()

In [24]:
vectorized_train[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
from sklearn.model_selection import train_test_split

In [61]:
X_train, X_test, y_train, y_test = train_test_split(vectorized_train,y,test_size=0.2,random_state=0)

In [75]:
classifier=LogisticRegression(C=3)

In [76]:
classifier.fit(X_train,y_train)

LogisticRegression(C=3, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [77]:
classifier.score(X_test,y_test)

0.8023637557452397

In [182]:
y_pred=classifier.predict(vectorized_test)

1