<a href="https://www.kaggle.com/code/olgaianina/disastertweets-with-tfidf-sgd-classifier?scriptVersionId=112031560" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#import TfidfVectorizer for convertion text to a matrix of TF-IDF features.
from sklearn.feature_extraction.text import TfidfVectorizer
#import classifier algorithm
from sklearn.linear_model import SGDClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
#Read data
train_data = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_data = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [3]:
#Let's look at train data, we see many NaN-values in columns "keyword" and "location".
train_data.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
#There is same situation in the test data.
test_data.head(5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
#Let's replace NaN-values to blank string ("")
train_data['keyword'] = train_data['keyword'].replace(np.NaN,"")
train_data['location'] = train_data['location'].replace(np.NaN,"")
test_data['keyword'] = test_data['keyword'].replace(np.NaN,"")
test_data['location'] = test_data['location'].replace(np.NaN,"")

In [6]:
#Now we delete hyperlinks from texts because of no important information. 
#delete links "http://t.co..." from text
train_data['text'] = train_data['text'].str.replace(r"http://\S*","",regex=True)
test_data['text'] = test_data['text'].str.replace(r"http://\S*","",regex=True)
#delete username links @aaa
train_data['text'] = train_data['text'].str.replace(r"@\S*","",regex=True)
test_data['text'] = test_data['text'].str.replace(r"@\S*","",regex=True)

In [7]:
#Lets's combine columns in one.
train_data['text'] = train_data['keyword']+" "+train_data['location']+" "+train_data['text']
test_data['text'] = test_data['keyword']+" "+test_data['location']+" "+test_data['text']
train_data.drop(['keyword','location'], axis=1, inplace=True)
test_data.drop(['keyword','location'], axis=1, inplace=True)

In [8]:
train_data.head(5)

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' ar...,1
3,6,"13,000 people receive #wildfires evacuation ...",1
4,7,Just got sent this photo from Ruby #Alaska a...,1


In [9]:
#Save target and id columns in the pandas series and drop them from train and test data.
train_y = train_data['target']
test_id = test_data['id']
train_data.drop(['target','id'], axis=1, inplace=True)
test_data.drop('id', axis=1, inplace=True)

In [10]:
print("size of train data =", train_data.shape, "\nsize of test data =", test_data.shape)

size of train data = (7613, 1) 
size of test data = (3263, 1)


In [11]:
#Let's transform data to TF-IDF vectors. We will use whole words and groups by several characters (n-grams) from 2 to 7 symbols.
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,7), smooth_idf=False, use_idf=False, strip_accents='unicode')
train_data = vectorizer.fit_transform(train_data['text'])
test_data = vectorizer.transform(test_data['text'])

In [12]:
#The number of columns has increased.
print("size of train data =", train_data.shape, "\nsize of test data =", test_data.shape)

size of train data = (7613, 272706) 
size of test data = (3263, 272706)


In [13]:
#SGDClassifier is a linear classifier with SGD training. 
#Even if we don't configure the hyperparameters and leave them by default values, the algorithm will still work very well.
sgd = SGDClassifier(random_state=0, n_jobs=-1)
sgd.fit(train_data, train_y)
y_pred_sgd = sgd.predict(test_data)
print("accuracy score:", round(sgd.score(train_data, train_y)*100,2))

accuracy score: 88.47


In [14]:
result_data = pd.DataFrame()
result_data['id']=test_id
result_data['target'] = y_pred_sgd
result_data.to_csv("submission.csv", index=False)