In [1]:
!pip install contractions
!pip install emot
!pip install num2words



# Data preparation

In [2]:
import pandas as pd
import numpy as np
# Load the labelled training set and the competition test set from the working directory
train = pd.read_csv('train.csv')
final_test = pd.read_csv('test.csv')
# Preview the first rows of each frame
print(train.head())
print(final_test.head())

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan


# Preprocessing

In [3]:
import nltk
from nltk.corpus import gutenberg, stopwords, wordnet
import contractions
from nltk.tokenize import word_tokenize
import string
from nltk.stem import WordNetLemmatizer
import re
from bs4 import BeautifulSoup
from num2words import num2words
from emot.emo_unicode import UNICODE_EMOJI #for EMOJIS
from emot.emo_unicode import EMOTICONS_EMO # For EMOTICONS
# Fetch the NLTK corpora/models referenced by this notebook's imports and helpers.
# The last call's return value is what the cell displays as its output.
nltk.download('gutenberg')  # corpus is imported above; no use of it is visible in the cells shown here
nltk.download('stopwords')  # English stop-word list read by rem_stopwords
nltk.download('omw-1.4')  # presumably needed alongside 'wordnet' by WordNetLemmatizer — TODO confirm
nltk.download('punkt')  # presumably the tokenizer model behind word_tokenize — TODO confirm
nltk.download('wordnet')  # WordNet data used by WordNetLemmatizer in lemmatize_words
nltk.download('words')  # no use of this corpus is visible in the cells shown here
nltk.download('averaged_perceptron_tagger')  # presumably the tagger model behind nltk.pos_tag — TODO confirm

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
def lower_case(text):
  """Return ``text`` converted to lower case."""
  lowered = text.lower()
  return lowered
def rem_lines(text):
    """Trim surrounding whitespace, then turn every remaining newline into a space."""
    trimmed = text.strip()
    # Splitting on '\n' and re-joining with ' ' swaps each newline for one space.
    return ' '.join(trimmed.split('\n'))
def rem_tags(text):
  """Parse ``text`` with BeautifulSoup's "html.parser" and return only its text content."""
  parsed = BeautifulSoup(text, "html.parser")
  return parsed.get_text()
def fix_ct(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded
def rem_stopwords(text):
  """Tokenize ``text`` and drop the tokens found in NLTK's English stop-word list.

  Args:
    text: the string to tokenize.

  Returns:
    A list of the remaining tokens (a list, not a joined string). Membership
    is checked with ``in``, so the comparison is case-sensitive.
  """
  # Build the stop-word set once and reuse it on later calls: this helper is
  # meant to be applied row by row, and re-reading the corpus per row is
  # wasted work.
  stop_words = getattr(rem_stopwords, "_stop_words", None)
  if stop_words is None:
    stop_words = frozenset(stopwords.words('english'))
    rem_stopwords._stop_words = stop_words
  return [word for word in word_tokenize(text) if word not in stop_words]
def rem_punct(text):
    """Tokenize ``text``, strip punctuation from each token, keep alphabetic tokens.

    Args:
        text: the string to tokenize.

    Returns:
        A list of tokens (not a joined string). Every character listed in
        ``string.punctuation`` is deleted from each token; tokens that are
        then empty or not purely alphabetic (``str.isalpha``) are discarded.
    """
    # Build the deletion table once instead of once per token.
    strip_table = str.maketrans('', '', string.punctuation)
    stripped = (token.translate(strip_table) for token in word_tokenize(text))
    return [token for token in stripped if token.isalpha()]
def lemmatize_words(text):
    """Lemmatize each whitespace-separated word of ``text`` using its POS tag.

    Words are tagged with ``nltk.pos_tag``; the first letter of each tag picks
    the WordNet part of speech (N, V, J, R), defaulting to noun for any other
    tag. The lemmas are returned joined by single spaces.
    """
    tag_to_wordnet = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wordnet_pos = tag_to_wordnet.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return ' '.join(lemmas)
def to_number(text):
    """Replace every run of digits in ``text`` with the result of ``num2words``.

    Each digit run is converted on its own, so a number written with
    separators (e.g. "13,000") is handled as several independent runs.
    """
    def _spell_out(match):
        # The matched digits are parsed as an int before conversion.
        return num2words(int(match.group(0)))

    return re.sub(r"(\d+)", _spell_out, text)
def convert_emoticons(text):
    """Replace each emoticon in ``text`` with its description from EMOTICONS_EMO.

    Matching is plain, case-sensitive substring replacement. Every inserted
    description is followed by a single space.

    Args:
        text: the string to scan.

    Returns:
        The text with all known emoticons replaced.
    """
    # Longest keys first, so a short emoticon that is a substring of a longer
    # one cannot consume part of it before the longer one is tried.
    # sorted() is stable, so keys of equal length keep their dictionary order.
    for emot in sorted(EMOTICONS_EMO, key=len, reverse=True):
        # The original appended `" ".replace("", "")`, which is just " ".
        text = text.replace(emot, EMOTICONS_EMO[emot] + " ")
    return text
def convert_emojis(text):
    """Replace each emoji in ``text`` with its description from UNICODE_EMOJI.

    Matching is plain substring replacement. Every inserted description is
    followed by a single space.

    Args:
        text: the string to scan.

    Returns:
        The text with all known emojis replaced.
    """
    # Longest keys first, so a multi-character emoji sequence is replaced as a
    # whole before any shorter key contained in it is tried.
    # sorted() is stable, so keys of equal length keep their dictionary order.
    for emot in sorted(UNICODE_EMOJI, key=len, reverse=True):
        # The original appended `" ".replace("", "")`, which is just " ".
        text = text.replace(emot, UNICODE_EMOJI[emot] + " ")
    return text
def clean_text(text):
  """Lower-case ``text`` and delete every character that is not a-z, 0-9 or whitespace."""
  lowered = text.lower()
  # Whitespace (including newlines) is kept; everything else outside a-z/0-9 is removed.
  return re.sub(r'[^a-z0-9\s]', '', lowered)

In [5]:
# Replace missing keyword/location values with empty strings.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on an attribute-accessed column: that form is chained
# assignment, which pandas warns about and which does not update the parent
# frame when Copy-on-Write is active.
train['keyword'] = train['keyword'].fillna('')
train['location'] = train['location'].fillna('')
final_test['keyword'] = final_test['keyword'].fillna('')
final_test['location'] = final_test['location'].fillna('')
print(train.head())
print(final_test.head())

   id keyword location                                               text  \
0   1                   Our Deeds are the Reason of this #earthquake M...   
1   4                              Forest fire near La Ronge Sask. Canada   
2   5                   All residents asked to 'shelter in place' are ...   
3   6                   13,000 people receive #wildfires evacuation or...   
4   7                   Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
   id keyword location                                               text
0   0                                  Just happened a terrible car crash
1   2                   Heard about #earthquake is different cities, s...
2   3                   there is a forest fire at spot pond, geese are...
3   9                            Apocalypse lighting. #Spokane #wildfires
4  11                       Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
# Columns whose text is concatenated into the single model input string.
# They are selected by name rather than by dropping 'id'/'target': once the
# 'features' column exists, a drop-based selection would fold the previous
# 'features' text back into itself whenever this cell is re-run.
TEXT_COLUMNS = ['keyword', 'location', 'text']
features_train = train[TEXT_COLUMNS]
features_test = final_test[TEXT_COLUMNS]
# Join the selected columns of each row with single spaces
train['features'] = features_train.astype(str).apply(' '.join, axis=1)
final_test['features'] = features_test.astype(str).apply(' '.join, axis=1)
print(train['features'])
print(final_test['features'])

0         Our Deeds are the Reason of this #earthquake...
1                  Forest fire near La Ronge Sask. Canada
2         All residents asked to 'shelter in place' ar...
3         13,000 people receive #wildfires evacuation ...
4         Just got sent this photo from Ruby #Alaska a...
                              ...                        
7608      Two giant cranes holding a bridge collapse i...
7609      @aria_ahrary @TheTawniest The out of control...
7610      M1.94 [01:04 UTC]?5km S of Volcano Hawaii. h...
7611      Police investigating after an e-bike collide...
7612      The Latest: More Homes Razed by Northern Cal...
Name: features, Length: 7613, dtype: object
0                      Just happened a terrible car crash
1         Heard about #earthquake is different cities,...
2         there is a forest fire at spot pond, geese a...
3                Apocalypse lighting. #Spokane #wildfires
4           Typhoon Soudelor kills 28 in China and Taiwan
                            

In [7]:
# Cleaning steps, applied top to bottom to BOTH datasets so the training and
# competition text can never drift apart. Commented-out entries are steps that
# are currently disabled; note that rem_stopwords and rem_punct return lists,
# which the next step would see as their str() representation.
# NOTE(review): lower_case runs before convert_emoticons, so any emoticon key
# containing upper-case letters can no longer match — confirm this is intended.
CLEANING_STEPS = [
    lower_case,
    # rem_lines,
    rem_tags,
    convert_emoticons,
    convert_emojis,
    # fix_ct,
    # rem_stopwords,
    # rem_punct,
    # to_number,
    # lemmatize_words,
]


def apply_cleaning(series, steps=CLEANING_STEPS):
    """Run every function in ``steps`` over ``series``, in order.

    Args:
        series: pandas Series of raw text.
        steps: ordered iterable of one-argument text functions.

    Returns:
        A new Series holding the cleaned values; ``series`` is not modified.
    """
    cleaned = series
    for step in steps:
        # Values are cast to str before each step, as the per-line version did.
        cleaned = cleaned.astype(str).apply(step)
    return cleaned


# cleaning train
train['features'] = apply_cleaning(train['features'])
# cleaning test
final_test['features'] = apply_cleaning(final_test['features'])
print(train['features'])
print(final_test['features'])

  return BeautifulSoup(text, "html.parser").get_text()


0         our deeds are the reason of this #earthquake...
1                  forest fire near la ronge sask. canada
2         all residents asked to 'shelter in place' ar...
3         13,000 people receive #wildfires evacuation ...
4         just got sent this photo from ruby #alaska a...
                              ...                        
7608      two giant cranes holding a bridge collapse i...
7609      @aria_ahrary @thetawniest the out of control...
7610      m1.94 [01:04 utc]?5km s of volcano hawaii. h...
7611      police investigating after an e-bike collide...
7612      the latest: more homes razed by northern cal...
Name: features, Length: 7613, dtype: object
0                      just happened a terrible car crash
1         heard about #earthquake is different cities,...
2         there is a forest fire at spot pond, geese a...
3                apocalypse lighting. #spokane #wildfires
4           typhoon soudelor kills 28 in china and taiwan
                            

In [8]:
# Training data preparation
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    train['features'],
    train['target'],
    test_size=0.2,
    random_state=42,
)
# Quick look at both splits
print(X_train)
print(X_test)

4996    military texas courageous and honest analysis ...
3263    engulfed  @zachzaidman @670thescore wld b a sh...
4907    massacre cottonwood arizona tell @barackobama ...
2855    drought spokane, wa worried about how the ca d...
4716    lava medan,indonesia @youngheroesid lava blast...
                              ...                        
5226    obliteration merica! @eganator2000 there aren'...
5390    panic  just had a panic attack bc i don't have...
860     blood  omron hem-712c automatic blood pressure...
7603      officials say a quarantine is in place at an...
7270    whirlwind stamford & cork (& shropshire) i mov...
Name: features, Length: 6090, dtype: object
2644    destruction  so you have a new weapon that can...
2227    deluge  the f$&@ing things i do for #gishwhes ...
5448    police uk dt @georgegalloway: rt @galloway4may...
132     aftershock  aftershock back to school kick off...
6845    trauma montgomery county, md in response to tr...
                            

In [9]:
#Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with smoothed IDF weighting and L2-normalised rows
vectorizer = TfidfVectorizer(
    min_df=0.,
    max_df=1.,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
)
# Fit on the training split only, then reuse the fitted vectorizer for the held-out split
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
# Vectorize the competition test set with the same fitted vectorizer
final_test_vec = vectorizer.transform(final_test['features'].astype(str))

# Training

In [10]:
#Some models
from sklearn.svm import LinearSVC as SVM
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import classification_report as CR
from sklearn.naive_bayes import BernoulliNB as NB
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import Perceptron as PT
# Baseline configuration of each candidate classifier
svm = SVM(penalty='l2', max_iter=500, C=1, random_state=42)
svc = SVC(max_iter=500, C=1, random_state=42)
lr = LR(max_iter=500)
nb = NB(alpha=.1)
rfc = RFC(n_estimators=10, random_state=42)
pt = PT(tol=1e-3, random_state=42)
# Candidates iterated over by fit()
# NOTE(review): svc is constructed above but not listed here — confirm this is intended
models = [svm, lr, nb, rfc, pt]
# Predictions appended by fit(), one entry per model in the order of `models`
pred = []

In [11]:
#Training with each model
from sklearn.metrics import accuracy_score, f1_score
def fit(x_train,y_train,x_test,y_test):
  """Train and evaluate every estimator in the module-level ``models`` list.

  For each estimator: fit on (x_train, y_train), predict x_test, print the
  estimator, its classification report, F1 and accuracy against y_test, and
  append the predictions to the module-level ``pred`` list. Returns None.
  """
  for estimator in models:
    estimator.fit(x_train, y_train)
    print(estimator)
    y_hat = estimator.predict(x_test)
    print(CR(y_test, y_hat))
    print("f1:", f1_score(y_test, y_hat))
    print("accuracy:", accuracy_score(y_test, y_hat))
    # Kept so the per-model predictions can be inspected after the loop
    pred.append(y_hat)
# Train every candidate on the vectorized training split and report held-out metrics
fit(x_train=X_train_vec, y_train=y_train, x_test=X_test_vec, y_test=y_test)

LinearSVC(C=1, max_iter=500, random_state=42)
              precision    recall  f1-score   support

           0       0.81      0.81      0.81       874
           1       0.75      0.74      0.74       649

    accuracy                           0.78      1523
   macro avg       0.78      0.78      0.78      1523
weighted avg       0.78      0.78      0.78      1523

f1: 0.7416602017067494
accuracy: 0.7813525935653316
LogisticRegression(max_iter=500)
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       874
           1       0.78      0.71      0.74       649

    accuracy                           0.79      1523
   macro avg       0.79      0.78      0.78      1523
weighted avg       0.79      0.79      0.79      1523

f1: 0.7447833065810594
accuracy: 0.7912015758371634
BernoulliNB(alpha=0.1)
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       874
           1       0.76      0.

In [12]:
# Re-fit LinearSVC, the model chosen after the comparison above, with adjusted
# settings: more iterations, smaller C, tighter tolerance and a different seed
svm = SVM(
    penalty='l2',
    max_iter=2000,
    C=0.2,
    random_state=38,
    tol=1e-9,
)
svm.fit(X_train_vec, y_train)
# Evaluate the re-fitted model on the held-out split
y_pred = svm.predict(X_test_vec)
print(CR(y_test, y_pred))
print("f1:", f1_score(y_test, y_pred))
print("accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.84      0.82       874
           1       0.77      0.72      0.75       649

    accuracy                           0.79      1523
   macro avg       0.79      0.78      0.78      1523
weighted avg       0.79      0.79      0.79      1523

f1: 0.7462211614956246
accuracy: 0.7905449770190414


# Execution

In [13]:
# Predict the competition test set with the re-fitted LinearSVC
y_pred = svm.predict(final_test_vec)
# Build the submission frame: drop the text columns and attach the
# predictions as 'target'
result_df = (
    final_test
    .drop(columns=['keyword', 'location', 'text', 'features'])
    .assign(target=y_pred)
)
print(result_df.head())

   id  target
0   0       1
1   2       1
2   3       1
3   9       0
4  11       1


# CSV creation

In [14]:
from google.colab import files
# Write the submission to CSV and trigger a browser download (Colab only)
# Name of the output CSV file
file_name_csv = "submission_Jairo_Acevedo_17_processing_1_svm.csv"
# index=False leaves the index column out of the file
result_df.to_csv(file_name_csv, index=False)
files.download(file_name_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>