# Notebook oficial - TP Datos

In [1]:
# Importando librerias
import pandas as pd
import numpy as np
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime
import warnings

import re
import nltk

from sklearn.feature_extraction.text import CountVectorizer

#Modelos
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the training and test splits
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
# Report the size of the training split (rows, then columns)
print('La dimension del dataset es: ', len(df_train), 'registros,', len(df_train.columns), 'columnas')
# Columns:
# id - unique identifier of each tweet
# keyword - a keyword for the tweet (may be missing)
# location - where the tweet was sent from (may be missing)
# text - the text of the tweet
# target - whether the tweet is about a real disaster (1) or not (0)
# Preview the first records
df_train.head(5)

La dimension del dataset es:  7613 registros, 5 columnas


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Column dtypes
# The columns shared by both splits receive the same casts; a missing
# keyword is replaced by the "noKeyword" placeholder before casting.
for frame in (df_train, df_test):
    frame['id'] = frame['id'].astype(int)
    frame['keyword'] = frame['keyword'].fillna(value="noKeyword").astype('object')
    frame['location'] = frame['location'].astype('object')
    frame['text'] = frame['text'].astype('object')
# Only the training split is given a boolean label column here.
df_train['target'] = df_train['target'].astype('bool')

## Filtrado de datos

In [4]:
# Remove rows that are exact duplicates across every column.
# NOTE(review): the describe() output that follows still reports 7613 rows
# while `text` has only 7503 unique values, so repeated tweets carrying
# different ids survive this step -- confirm that this is intended.
df_train = df_train.drop_duplicates()

In [5]:
# Summary statistics for every column, numeric and non-numeric alike.
df_train.describe(include="all")

Unnamed: 0,id,keyword,location,text,target
count,7613.0,7613,5080,7613,7613
unique,,222,3341,7503,2
top,,noKeyword,USA,11-Year-Old Boy Charged With Manslaughter of T...,False
freq,,61,104,10,4342
mean,5441.934848,,,,
std,3137.11609,,,,
min,1.0,,,,
25%,2734.0,,,,
50%,5408.0,,,,
75%,8146.0,,,,


In [6]:
# Drop the columns that are not used in the analysis.
# Only the training frame is trimmed; the equivalent drop on df_test is
# disabled, so df_test keeps its 'location' column.
df_train = df_train.drop(columns=['location'])
df_train.describe(include="all")

Unnamed: 0,id,keyword,text,target
count,7613.0,7613,7613,7613
unique,,222,7503,2
top,,noKeyword,11-Year-Old Boy Charged With Manslaughter of T...,False
freq,,61,10,4342
mean,5441.934848,,,
std,3137.11609,,,
min,1.0,,,
25%,2734.0,,,
50%,5408.0,,,
75%,8146.0,,,


## Preprocesamiento de datos

### Limpieza del texto

In [7]:
def clean_text(text):
    """Normalize a raw tweet string before tokenization.

    The steps are applied in this order:

    1. lowercase the text;
    2. delete newline characters (the surrounding words are joined, no
       space is inserted);
    3. delete square-bracketed spans, brackets included;
    4. delete angle-bracketed spans, brackets included;
    5. delete every run of word characters that contains at least one digit.

    Whitespace left behind by a deleted span is not collapsed.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    text = text.lower()
    text = re.sub('\n', '', text)
    # Raw strings keep the regex escapes intact without relying on Python's
    # handling of invalid string escapes (a SyntaxWarning on Python 3.12+).
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    return text

In [8]:
# Work on copies so the frames loaded above keep their original text.
df_train_res, df_test_res = df_train.copy(), df_test.copy()

### Preprocesamiento


In [9]:
def pre_process_text(text):
    """Clean a tweet and rebuild it as space-separated word tokens.

    The text first goes through ``clean_text``; the result is then split
    with a ``\\w+`` regular-expression tokenizer, which discards whatever
    is not a word character, and the tokens are joined back with single
    spaces.

    Parameters
    ----------
    text : str
        The raw text to process.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    cleaned = clean_text(text)
    # Tokenizing on word characters strips the remaining punctuation.
    return ' '.join(word_tokenizer.tokenize(cleaned))

In [10]:
# Run the same text preprocessing over both splits.
df_train_res['text'] = df_train_res['text'].apply(pre_process_text)
df_test_res['text'] = df_test_res['text'].apply(pre_process_text)
df_test_res

Unnamed: 0,id,keyword,location,text
0,0,noKeyword,,just happened a terrible car crash
1,2,noKeyword,,heard about earthquake is different cities sta...
2,3,noKeyword,,there is a forest fire at spot pond geese are ...
3,9,noKeyword,,apocalypse lighting spokane wildfires
4,11,noKeyword,,typhoon soudelor kills in china and taiwan
...,...,...,...,...
3258,10861,noKeyword,,earthquake safety los angeles ûò safety fasten...
3259,10865,noKeyword,,storm in ri worse than last hurricane my city ...
3260,10868,noKeyword,,green line derailment in chicago http t co utb...
3261,10874,noKeyword,,meg issues hazardous weather outlook hwo http ...


### Vectorización

In [11]:
# Token-count features: the vectorizer is fitted on the training text only
# and then reused, unchanged, to transform the test text.
vectorizer = CountVectorizer()

df_train_x = vectorizer.fit_transform(df_train_res['text'])
df_test_x = vectorizer.transform(df_test_res['text']) # only the text column is used as a feature

## Algoritmos empleados

In [12]:
# Model setup and training
# The labels come from df_train; its rows line up with df_train_x because
# df_train_res was created as a copy of df_train.
model = LogisticRegression(max_iter=1000)
model.fit(X=df_train_x, y=df_train['target'])

LogisticRegression(max_iter=1000)

In [13]:
def convertStates(x):
    """Map a predicted label to its integer form.

    Parameters
    ----------
    x : object
        The value to convert.

    Returns
    -------
    int
        1 when ``x`` compares equal to ``True``, otherwise 0.
    """
    return 1 if x == True else 0

In [14]:
# Prediction
pred_data = model.predict(df_test_x)
# One row per test tweet: its id next to the predicted label.
pred = pd.DataFrame({'id': df_test['id'], 'target': pred_data.tolist()})
# Store the labels as 0/1 integers.
pred['target'] = pred['target'].apply(convertStates)

## Conversión a CSV

In [15]:
# Write the predictions to disk; index=False leaves the row index out of the file.
# NOTE(review): presumably the results/ directory must already exist -- confirm.
pred.to_csv('results/result2.csv',index=False)