In [1]:
import pandas as pd

tweets = pd.read_csv('train.csv')

In [2]:
tweets[tweets.target==0].text

15                                         What's up man?
16                                          I love fruits
17                                       Summer is lovely
18                                      My car is so fast
19                           What a goooooooaaaaaal!!!!!!
                              ...                        
7581    @engineshed Great atmosphere at the British Li...
7582    Cramer: Iger's 3 words that wrecked Disney's s...
7584    These boxes are ready to explode! Exploding Ki...
7587                                   Sirens everywhere!
7593    I just heard a really loud bang and everyone i...
Name: text, Length: 4342, dtype: object

In [3]:
pd.get_option('max_colwidth')

50

In [4]:
pd.set_option('max_colwidth', 280)

In [5]:
tweets[tweets.target==0]

Unnamed: 0,id,keyword,location,text,target
15,23,,,What's up man?,0
16,24,,,I love fruits,0
17,25,,,Summer is lovely,0
18,26,,,My car is so fast,0
19,28,,,What a goooooooaaaaaal!!!!!!,0
...,...,...,...,...,...
7581,10833,wrecked,Lincoln,@engineshed Great atmosphere at the British Lion gig tonight. Hearing is wrecked. http://t.co/oMNBAtJEAO,0
7582,10834,wrecked,,Cramer: Iger's 3 words that wrecked Disney's stock - CNBC http://t.co/N6RBnHMTD4,0
7584,10837,,,These boxes are ready to explode! Exploding Kittens finally arrived! gameofkittens #explodingkittensÛ_ https://t.co/TFGrAyuDC5,0
7587,10841,,,Sirens everywhere!,0


In [6]:
tweets[tweets.target==1]

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ,1
7611,10872,,,Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.,1


In [7]:
import plotly.graph_objs as go
import plotly.offline as py

z = {0:'Not disater', 1:'Disaster'}
data = [go.Bar(
            x = tweets.target.map(z).unique(),
            y = tweets.target.value_counts().values,
            marker= dict(colorscale='Jet',
                         color = tweets.target.value_counts().values
                        ),
            text='Text entries attributed to Author'
    )]

layout = go.Layout(
    title='Target variable distribution'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

In [8]:
tweets.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [9]:
tweets = tweets.drop(['keyword', 'location', 'id'], axis=1)

In [10]:

durty = list()
for tweet in tweets.text:
    resp = re.findall(r"(@[A-Za-z0-9]+)", tweet)
    if resp: 
        durty+=resp
durty[:10]

NameError: name 're' is not defined

In [11]:
import nltk
from nltk import tokenize
import re

def treatment1(df, field):
    regex1 = r"(@[A-Za-z0-9]+)"
    regex2 = r"(\w+:\/\/\S+)"
    regex3 = r"^rt"
    regex4 = r"([^0-9A-Za-z \t])"
    
    clean_data = df[field].apply(lambda elem: re.sub(regex1+"|"+regex2+"|"+regex3+"|"+regex4, " ", elem))
    
    return clean_data.str.lower()

In [12]:
tweets['treated'] = treatment1(tweets, 'text')

In [13]:
from string import punctuation
stemmer = nltk.stem.PorterStemmer()

stopwords = nltk.corpus.stopwords.words("english")

punctuation = list()
for ponto in punctuation:
    punctuation.append(ponto)

punctuation_stopwords = punctuation + stopwords

def treatment2(df, field, stop):
    cleaned = df[field].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    return cleaned.apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

In [14]:
tweets['treated'] = treatment2(tweets, 'treated', punctuation_stopwords)

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tweets['treated'],tweets['target'],random_state = 0)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SGDClassifier()),
])

model = pipeline_sgd.fit(X_train, y_train)

In [19]:
from sklearn.metrics import classification_report
y_predict = model.predict(X_test)
print(classification_report(y_test, y_predict))

precision    recall  f1-score   support

           0       0.81      0.85      0.83      1107
           1       0.78      0.72      0.75       797

    accuracy                           0.80      1904
   macro avg       0.79      0.79      0.79      1904
weighted avg       0.80      0.80      0.80      1904



In [21]:
test_data = pd.read_csv('test.csv')

submission_test_clean = test_data.copy()
submission_test_clean['text'] = treatment1(submission_test_clean, "text")
submission_test_clean = treatment2(submission_test_clean, "text", punctuation_stopwords)
submission_test_clean.head()

0                                     happen terribl car crash
1                heard earthquak differ citi stay safe everyon
2    forest fire spot pond gees flee across street cannot save
3                               apocalyps light spokan wildfir
4                        typhoon soudelor kill 28 china taiwan
Name: text, dtype: object

In [22]:
submission_test_pred = model.predict(submission_test_clean)

In [23]:
id_col = test_data['id']
submission_df_1 = pd.DataFrame({
                  "id": id_col, 
                  "target": submission_test_pred})
submission_df_1.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [24]:
submission_df_1.to_csv('submission_1.csv', index=False)