In [84]:
import warnings
warnings.filterwarnings('ignore')

In [85]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [86]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_sub = pd.read_csv('sample_submission.csv')

In [87]:
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [88]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [89]:
df_train['keyword'].unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [90]:
df_train['location'].unique()

array([nan, 'Birmingham', 'Est. September 2012 - Bristol', ...,
       'Vancouver, Canada', 'London ', 'Lincoln'],
      shape=(3342,), dtype=object)

In [91]:
df_train.drop(['id', 'location'], axis = 1, inplace = True)
df_test.drop(['id', 'location'], axis = 1, inplace = True)

In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [93]:
tfidf_tx = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_kw = TfidfVectorizer(stop_words='english')

In [94]:
X_kw = tfidf_kw.fit_transform(df_train['keyword'].fillna(''))

In [95]:
X_tx = tfidf_tx.fit_transform(df_train['text'])

In [96]:
from scipy.sparse import hstack

In [97]:
X = hstack([X_tx, X_kw])
y = df_train['target']

In [98]:
X_test_text = tfidf_tx.transform(df_test['text'])

In [99]:
X_test_kw = tfidf_kw.transform(df_test['keyword'].fillna(''))

In [100]:
X_test = hstack([X_test_text, X_test_kw])

In [101]:
from sklearn.linear_model import LogisticRegression

In [102]:
from sklearn.model_selection import GridSearchCV

In [103]:
model = LogisticRegression(max_iter=1000)

In [104]:
params = {
    'C':[0.01,0.1,1],
    'penalty' : ['l1', 'l2', 'elasticnet', None],
    'l1_ratio' :[0.2,0.45,0.7],
    'solver' : ['liblinear', 'newton-cg', 'sag', 'saga', 'lbfgs']
}

In [106]:
final_model = GridSearchCV(model, params, cv = 3, verbose = 2, n_jobs=-1)

In [107]:
final_model.fit(X,y)

Fitting 3 folds for each of 180 candidates, totalling 540 fits


0,1,2
,estimator,LogisticRegre...max_iter=1000)
,param_grid,"{'C': [0.01, 0.1, ...], 'l1_ratio': [0.2, 0.45, ...], 'penalty': ['l1', 'l2', ...], 'solver': ['liblinear', 'newton-cg', ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,1000


In [109]:
y_pred = final_model.predict(X_test)

In [110]:
df_sub['target'] = y_pred

In [111]:
df_sub.to_csv('sub3.csv', index=False)