## Importing Libraries & creating dataframes.

In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
df1=pd.read_csv('train.csv')
df2=pd.read_csv('test.csv')
result=pd.read_csv('sample_submission.csv')

In [4]:
train_df=df1.loc[:,'text']   #7613
test_df=df2.loc[:,'text']   # 3263

train_label=df1.loc[:,'target'].values  #7613

In [5]:
merged_features=pd.concat([train_df,test_df]).reset_index(drop=True)  #10876
merged_features 


0        Our Deeds are the Reason of this #earthquake M...
1                   Forest fire near La Ronge Sask. Canada
2        All residents asked to 'shelter in place' are ...
3        13,000 people receive #wildfires evacuation or...
4        Just got sent this photo from Ruby #Alaska as ...
                               ...                        
10871    EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
10872    Storm in RI worse than last hurricane. My city...
10873    Green Line derailment in Chicago http://t.co/U...
10874    MEG issues Hazardous Weather Outlook (HWO) htt...
10875    #CityofCalgary has activated its Municipal Eme...
Name: text, Length: 10876, dtype: object

# Cleaning merged features and creating sparse matrix.

In [6]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus=[]     # To store cleaned words

for i in range (0,10875):
    
    tweet=re.sub('[^a-zA-Z]',' ', merged_features[i])  # Removing non-alphabatical charecters
    tweet=tweet.lower()  # making tweets in lower case
    tweet=tweet.split()  # Splitting tweets into words
    ps=PorterStemmer()
    tweet=[ps.stem(words) for words in tweet if not words in set(stopwords.words('english'))]
    tweet=' '.join(tweet)
    corpus.append(tweet)

from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=18800)
sparse=cv.fit_transform(corpus).toarray() 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
sparse

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [8]:
sparse.shape

(10875, 18800)

In [9]:
type(sparse)

numpy.ndarray

## Splitting merged sparse matrix into train & test sparse matrix.

In [10]:
train_feat=sparse[:7613,:]

test_feat=sparse[7613:,:]

In [11]:
train_feat

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [12]:
train_feat.shape

(7613, 18800)

In [13]:
test_feat.shape

(3262, 18800)

## Training classifier on test sparse matrix.

In [15]:
from catboost import CatBoostClassifier
model=CatBoostClassifier()

model.fit(train_feat,train_label)

predicted_label=model.predict(test_feat)
print('Process is done')



Learning rate set to 0.024511
0:	learn: 0.6898555	total: 852ms	remaining: 14m 10s
1:	learn: 0.6870028	total: 1000ms	remaining: 8m 18s
2:	learn: 0.6838897	total: 1.1s	remaining: 6m 4s
3:	learn: 0.6811982	total: 1.2s	remaining: 4m 57s
4:	learn: 0.6787576	total: 1.3s	remaining: 4m 18s
5:	learn: 0.6764045	total: 1.39s	remaining: 3m 51s
6:	learn: 0.6732560	total: 1.49s	remaining: 3m 31s
7:	learn: 0.6709065	total: 1.59s	remaining: 3m 17s
8:	learn: 0.6687499	total: 1.69s	remaining: 3m 6s
9:	learn: 0.6662079	total: 1.79s	remaining: 2m 57s
10:	learn: 0.6635844	total: 1.89s	remaining: 2m 49s
11:	learn: 0.6612514	total: 1.99s	remaining: 2m 43s
12:	learn: 0.6588783	total: 2.09s	remaining: 2m 38s
13:	learn: 0.6566666	total: 2.19s	remaining: 2m 34s
14:	learn: 0.6547864	total: 2.29s	remaining: 2m 30s
15:	learn: 0.6530507	total: 2.39s	remaining: 2m 27s
16:	learn: 0.6514629	total: 2.49s	remaining: 2m 23s
17:	learn: 0.6499658	total: 2.59s	remaining: 2m 21s
18:	learn: 0.6481679	total: 2.69s	remaining: 2m

158:	learn: 0.5511324	total: 17.5s	remaining: 1m 32s
159:	learn: 0.5508172	total: 17.6s	remaining: 1m 32s
160:	learn: 0.5503116	total: 17.7s	remaining: 1m 32s
161:	learn: 0.5499077	total: 17.8s	remaining: 1m 32s
162:	learn: 0.5495330	total: 17.9s	remaining: 1m 31s
163:	learn: 0.5492217	total: 18s	remaining: 1m 31s
164:	learn: 0.5489506	total: 18.1s	remaining: 1m 31s
165:	learn: 0.5486009	total: 18.2s	remaining: 1m 31s
166:	learn: 0.5482338	total: 18.3s	remaining: 1m 31s
167:	learn: 0.5478913	total: 18.4s	remaining: 1m 31s
168:	learn: 0.5476367	total: 18.5s	remaining: 1m 31s
169:	learn: 0.5473084	total: 18.6s	remaining: 1m 30s
170:	learn: 0.5471447	total: 18.7s	remaining: 1m 30s
171:	learn: 0.5468083	total: 18.8s	remaining: 1m 30s
172:	learn: 0.5462813	total: 18.9s	remaining: 1m 30s
173:	learn: 0.5459231	total: 19s	remaining: 1m 30s
174:	learn: 0.5456115	total: 19.1s	remaining: 1m 30s
175:	learn: 0.5451714	total: 19.2s	remaining: 1m 30s
176:	learn: 0.5448765	total: 19.3s	remaining: 1m 2

314:	learn: 0.5104477	total: 33.6s	remaining: 1m 13s
315:	learn: 0.5101822	total: 33.7s	remaining: 1m 13s
316:	learn: 0.5100194	total: 33.8s	remaining: 1m 12s
317:	learn: 0.5097875	total: 34s	remaining: 1m 12s
318:	learn: 0.5096200	total: 34s	remaining: 1m 12s
319:	learn: 0.5094168	total: 34.1s	remaining: 1m 12s
320:	learn: 0.5091832	total: 34.3s	remaining: 1m 12s
321:	learn: 0.5089197	total: 34.4s	remaining: 1m 12s
322:	learn: 0.5087126	total: 34.4s	remaining: 1m 12s
323:	learn: 0.5085142	total: 34.6s	remaining: 1m 12s
324:	learn: 0.5082811	total: 34.6s	remaining: 1m 11s
325:	learn: 0.5081208	total: 34.8s	remaining: 1m 11s
326:	learn: 0.5078881	total: 34.8s	remaining: 1m 11s
327:	learn: 0.5076419	total: 34.9s	remaining: 1m 11s
328:	learn: 0.5073728	total: 35s	remaining: 1m 11s
329:	learn: 0.5071006	total: 35.1s	remaining: 1m 11s
330:	learn: 0.5068905	total: 35.2s	remaining: 1m 11s
331:	learn: 0.5066695	total: 35.3s	remaining: 1m 11s
332:	learn: 0.5063989	total: 35.4s	remaining: 1m 10s

475:	learn: 0.4754111	total: 49.8s	remaining: 54.8s
476:	learn: 0.4752646	total: 49.9s	remaining: 54.7s
477:	learn: 0.4750969	total: 50s	remaining: 54.6s
478:	learn: 0.4748667	total: 50.1s	remaining: 54.5s
479:	learn: 0.4746510	total: 50.2s	remaining: 54.4s
480:	learn: 0.4744466	total: 50.3s	remaining: 54.3s
481:	learn: 0.4742454	total: 50.4s	remaining: 54.1s
482:	learn: 0.4740635	total: 50.5s	remaining: 54s
483:	learn: 0.4738966	total: 50.6s	remaining: 53.9s
484:	learn: 0.4736864	total: 50.7s	remaining: 53.8s
485:	learn: 0.4735218	total: 50.8s	remaining: 53.7s
486:	learn: 0.4733368	total: 50.9s	remaining: 53.6s
487:	learn: 0.4731806	total: 51s	remaining: 53.5s
488:	learn: 0.4729713	total: 51.1s	remaining: 53.4s
489:	learn: 0.4727732	total: 51.2s	remaining: 53.3s
490:	learn: 0.4726087	total: 51.3s	remaining: 53.2s
491:	learn: 0.4723576	total: 51.4s	remaining: 53s
492:	learn: 0.4721217	total: 51.5s	remaining: 52.9s
493:	learn: 0.4719108	total: 51.6s	remaining: 52.8s
494:	learn: 0.471692

635:	learn: 0.4480165	total: 1m 6s	remaining: 38s
636:	learn: 0.4478146	total: 1m 6s	remaining: 37.9s
637:	learn: 0.4476845	total: 1m 6s	remaining: 37.8s
638:	learn: 0.4474986	total: 1m 6s	remaining: 37.7s
639:	learn: 0.4473323	total: 1m 6s	remaining: 37.5s
640:	learn: 0.4471622	total: 1m 6s	remaining: 37.4s
641:	learn: 0.4469729	total: 1m 6s	remaining: 37.3s
642:	learn: 0.4467733	total: 1m 7s	remaining: 37.2s
643:	learn: 0.4465998	total: 1m 7s	remaining: 37.1s
644:	learn: 0.4464605	total: 1m 7s	remaining: 37s
645:	learn: 0.4463083	total: 1m 7s	remaining: 36.9s
646:	learn: 0.4461440	total: 1m 7s	remaining: 36.8s
647:	learn: 0.4459538	total: 1m 7s	remaining: 36.7s
648:	learn: 0.4458955	total: 1m 7s	remaining: 36.6s
649:	learn: 0.4457380	total: 1m 7s	remaining: 36.5s
650:	learn: 0.4456626	total: 1m 7s	remaining: 36.4s
651:	learn: 0.4454294	total: 1m 7s	remaining: 36.3s
652:	learn: 0.4452850	total: 1m 8s	remaining: 36.2s
653:	learn: 0.4451523	total: 1m 8s	remaining: 36s
654:	learn: 0.4449

791:	learn: 0.4277490	total: 1m 22s	remaining: 21.7s
792:	learn: 0.4276453	total: 1m 22s	remaining: 21.5s
793:	learn: 0.4275273	total: 1m 22s	remaining: 21.4s
794:	learn: 0.4273817	total: 1m 22s	remaining: 21.3s
795:	learn: 0.4272544	total: 1m 22s	remaining: 21.2s
796:	learn: 0.4272027	total: 1m 22s	remaining: 21.1s
797:	learn: 0.4270786	total: 1m 23s	remaining: 21s
798:	learn: 0.4269428	total: 1m 23s	remaining: 20.9s
799:	learn: 0.4268717	total: 1m 23s	remaining: 20.8s
800:	learn: 0.4267290	total: 1m 23s	remaining: 20.7s
801:	learn: 0.4265889	total: 1m 23s	remaining: 20.6s
802:	learn: 0.4265303	total: 1m 23s	remaining: 20.5s
803:	learn: 0.4263819	total: 1m 23s	remaining: 20.4s
804:	learn: 0.4263277	total: 1m 23s	remaining: 20.3s
805:	learn: 0.4261835	total: 1m 23s	remaining: 20.2s
806:	learn: 0.4260693	total: 1m 23s	remaining: 20.1s
807:	learn: 0.4259481	total: 1m 24s	remaining: 20s
808:	learn: 0.4259055	total: 1m 24s	remaining: 19.9s
809:	learn: 0.4257681	total: 1m 24s	remaining: 19.

948:	learn: 0.4108705	total: 1m 38s	remaining: 5.29s
949:	learn: 0.4107767	total: 1m 38s	remaining: 5.18s
950:	learn: 0.4106759	total: 1m 38s	remaining: 5.08s
951:	learn: 0.4105639	total: 1m 38s	remaining: 4.97s
952:	learn: 0.4104497	total: 1m 38s	remaining: 4.87s
953:	learn: 0.4103279	total: 1m 38s	remaining: 4.77s
954:	learn: 0.4101673	total: 1m 38s	remaining: 4.66s
955:	learn: 0.4100735	total: 1m 39s	remaining: 4.56s
956:	learn: 0.4099535	total: 1m 39s	remaining: 4.46s
957:	learn: 0.4098594	total: 1m 39s	remaining: 4.35s
958:	learn: 0.4097564	total: 1m 39s	remaining: 4.25s
959:	learn: 0.4096500	total: 1m 39s	remaining: 4.14s
960:	learn: 0.4096137	total: 1m 39s	remaining: 4.04s
961:	learn: 0.4095127	total: 1m 39s	remaining: 3.94s
962:	learn: 0.4093805	total: 1m 39s	remaining: 3.83s
963:	learn: 0.4093475	total: 1m 39s	remaining: 3.73s
964:	learn: 0.4092599	total: 1m 39s	remaining: 3.63s
965:	learn: 0.4092198	total: 1m 40s	remaining: 3.52s
966:	learn: 0.4091497	total: 1m 40s	remaining:

In [16]:
len(predicted_label)

3262

## Calculating model accuracy.

In [None]:
from sklearn.model_selection import cross_val_score
model_accuracy=cross_val_score(estimator=model,X=train_feat,y=train_label,cv=20)
 
model_accuracy*100 

## Submission file

In [None]:
r=pd.DataFrame(predicted_label)
r.to_excel('sub.xlsx')
 