In [1]:
import pandas as pd
import json
pd.set_option('display.max_columns', None) #Quitamos limitación de columnas máximas a mostrar
!cd

C:\Users\User\Documents\Master Data Science\SocialBigData\Twitter


### Importamos los conjuntos de entrenamiento y test: 
##### Hay cuatro categorías:
    Tráfico = T
    Contaminación = C
    Ambos = A
    Ninguno = N

In [2]:
train_tweets = pd.read_excel("./Data/train_tweets.xlsx", header=0, index_col='_id')
train_tweets.head(2)

Unnamed: 0_level_0,created_at,text,Category,Unnamed: 4
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
791546620915486720,2016-10-27 07:46:35,#madrid alcanzado el escenario 1 del protocolo...,A,
793885278553317376,2016-11-02 18:39:34,#madrid carmena propone restricciones al trafi...,A,


In [3]:
test_tweets = pd.read_excel("./Data/test_tweets.xlsx", header=0, index_col='_id')
test_tweets.head(2)

Unnamed: 0_level_0,created_at,text,Category
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
801338536914022401,2016-11-23 08:16:10,#madrid marcha y concentracion ciclista por el...,A
842058467443630080,2017-03-15 17:02:37,nio eve: el coche autonomo (nivel 5) que se ca...,A


### Operaciones de tokenización, stemming y limpieza de texto:

In [4]:
%%time
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

cachedStopWords = stopwords.words("spanish")
stemmer = SnowballStemmer('spanish')

# Eliminamos carácteres no alfanuméricos:
# Eliminamos StopWords:
# Lematizamos:
train_tweets["SUMs"]=train_tweets["text"].apply(lambda x: re.sub(r'[^\w\s]','',str(x))).str.split()\
.apply(lambda x: list(set(filter(lambda y: y not in cachedStopWords,x))))\
.apply(lambda x: [stemmer.stem(token) for token in x])

test_tweets["SUMs"]=test_tweets["text"].apply(lambda x: re.sub(r'[^\w\s]','',str(x))).str.split()\
.apply(lambda x: list(set(filter(lambda y: y not in cachedStopWords,x))))\
.apply(lambda x: [stemmer.stem(token) for token in x])

Wall time: 1.86 s


In [5]:
train_tweets.sample(2)

Unnamed: 0_level_0,created_at,text,Category,Unnamed: 4,SUMs
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
751373952392593408,2016-07-08 11:14:44,#incdgt el 08-07-2016 09:00:00 tipo #obra (obr...,T,136.0,"[obras, 090000, m50, 08072016, pk33_6, kilomet..."
770233905106608132,2016-08-29 12:17:27,#incdgt el 29-08-2016 09:30:00 tipo #obra (man...,T,,"[crecient, coll, puent, d, manten, 29082016, i..."


### Creamos diccionario de palabras únicas y número de tweets en los que aparecen:

In [6]:
%%time
diccio = {}
for SUM in train_tweets.append(test_tweets)["SUMs"]:
    for word in SUM:
        if word in diccio.keys():
            diccio[word]+=1
        else: diccio[word]=1
print(len(diccio))

5460
Wall time: 29.1 ms


### Creamos diccionario de palabras con IDFs:

In [7]:
import numpy as np

idfs={}
n_docs = len(train_tweets.append(test_tweets))

for key in diccio.keys():
    idfs[key]= np.log10(n_docs/diccio[key])
print(len(idfs))

5460


In [8]:
idfs["trafic"]

2.2894774663446023

### Definimos función para crear vector de características para cada tweet:

In [9]:
def feat_vect_creat(vect, diccionario):
    """Creación del vector de características"""
    feat_vect = []
    for feat in diccionario.keys():
        if feat in vect:
            feat_vect.append(diccionario[feat])
        else:
            feat_vect.append(0)
    return feat_vect

### Creamos los vectores de características de cada tweet para los sets de entrenamiento y test:

In [10]:
%%time

train_tweets["feature_vector"] = train_tweets["SUMs"].map(lambda x: feat_vect_creat(x, idfs))
test_tweets["feature_vector"] = test_tweets["SUMs"].map(lambda x: feat_vect_creat(x, idfs))

Wall time: 2.35 s


In [11]:
train_tweets.head(2)

Unnamed: 0_level_0,created_at,text,Category,Unnamed: 4,SUMs,feature_vector
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
791546620915486720,2016-10-27 07:46:35,#madrid alcanzado el escenario 1 del protocolo...,A,,"[contaminacion, alcanz, protocol, restriccion,...","[1.52980962165, 3.19256745334, 2.11338620729, ..."
793885278553317376,2016-11-02 18:39:34,#madrid carmena propone restricciones al trafi...,A,,"[madridâ, httpstcotqybs8mv8c, restriccion, cen...","[0, 0, 0, 2.41441620295, 0, 0, 0, 0, 0, 2.2894..."


### Extraemos los vectores de características en listas para a continuación crear DataFames de características:

In [12]:
%%time

train_vectors = [vector for vector in train_tweets["feature_vector"].values]
test_vectors = [vector for vector in test_tweets["feature_vector"].values]

train_df = pd.DataFrame(train_vectors, columns=idfs.keys(), index=train_tweets.index)
test_df = pd.DataFrame(test_vectors, columns=idfs.keys(), index=test_tweets.index)

print(test_df.shape)

(222, 5460)
Wall time: 1.52 s


### Definimos funciones para obtener la ganancia de cada raíz

In [13]:
def entropy(attribute_data):
    """
    Calcula la entropía de cada palabra
    Recibe un vector y devuelve un flotante
    """
    _, val_freqs = np.unique(attribute_data, return_counts=True)
    # probabilities for each unique attribute value
    val_probs = val_freqs / len(attribute_data)
    return -val_probs.dot(np.log(val_probs))
print(entropy(train_df["posibl"]))
print(entropy(train_df["trafic"]))

0.00613552470748
0.0327412470058


In [14]:
def get_count_dict(data):
    """
    Devuelve valores únicos y su frecuencia en un diccionario, para cada vector (de raiz) proporcionado
    """
    data_values, data_freqs = np.unique(data, return_counts=True)
    return dict(zip(data_values, data_freqs))

In [15]:
def info_gain(attribute_data, labels):
    """
    Calcula la ganancia de información
    Recibe un vector de caracteristicas para una raíz, y un vector de etiquetas
    Devuelve la ganancia como valor flotante
    """
    attr_val_counts = get_count_dict(attribute_data)
    total_count = len(labels)
    EA = 0.0
    for attr_val, attr_val_count in attr_val_counts.items():
        EA += attr_val_count * entropy(labels[attribute_data == attr_val])


    return entropy(labels) - EA / total_count

print(info_gain(train_df["posibl"],train_tweets["Category"]))
print(info_gain(train_df["trafic"],train_tweets["Category"]))

0.00355915360218
0.0105690883726


### Creamos diccionario de ganancia de las raíces 

In [16]:
%%time
ganancias={}
for key in idfs.keys():
    ganancias[key]= info_gain(train_df[key],train_tweets["Category"])

Wall time: 10 s


### Creamos diccionario de las raíces con mayor ganancia
#### Utilizamos como criterio las que tienen ganancias por encima de la media

In [17]:
%%time
media = np.mean(list(ganancias.values()))
relevant_stems = {}
for key in ganancias.keys():
    if ganancias[key]>media:
        relevant_stems[key] = ganancias[key]
print(len(ganancias.keys()))
print(len(relevant_stems.keys()))

5460
595
Wall time: 1 ms


### Reconstruimos los vectores de características para sólo las palabras relevantes

In [18]:
%%time

#Creación de los vectores en los DataFrames
train_tweets["feature_vector"] = train_tweets["SUMs"].map(lambda x: feat_vect_creat(x, relevant_stems))
test_tweets["feature_vector"] = test_tweets["SUMs"].map(lambda x: feat_vect_creat(x, relevant_stems))

#Extracción de los vectores
train_vectors = [vector for vector in train_tweets["feature_vector"].values]
test_vectors = [vector for vector in test_tweets["feature_vector"].values]

print(len(train_vectors[0]))
print(len(test_vectors[0]))

#Creación de DataFrames de características
train_df = pd.DataFrame(train_vectors, columns=relevant_stems.keys(), index=train_tweets.index)
test_df = pd.DataFrame(test_vectors, columns=relevant_stems.keys(), index=test_tweets.index)

595
595
Wall time: 469 ms


In [19]:
print(train_df.shape)
train_df.head(2)

(1336, 595)


Unnamed: 0_level_0,contaminacion,alcanz,protocol,restriccion,1,posibl,escenari,28,httpstco6dho7ojtpf,trafic,madr,viern,madridâ,httpstcotqybs8mv8c,centr,carmen,resident,propon,anunci,httpstcot40hzmua2z,aparc,circulacion,12,delââ,httpstcoetldkovpdc,concieloazul,estoesmadridâ,conestaluz,sintraf,sinmaloshum,httpstcob8emmrht2i,distintivoambiental,mostol,coch,ecofriendly,dgt,fog,spain,carabany,httpstcody37a80vrn,niebl,mas,50,httpstcognoqnaw2jx,httpstcopem4r4veoj,contamin,pag,gratis,experient,aparcadondequ,primer,â,emov,httpstcoijuwqoabmt,nocontamin,feliz,httpstcozlrwhaupo,gran,diciembr,air,mejor,cierr,calid,32,parcial,viaâ,levels,ban,httpstcozvbyoz4jfi,from,half,to,pollution,due,the,city,be,will,tim,cars,cent,this,thatâ,first,of,high,ecologicosâ,parquimetr,subir,except,preci,httpstcoaoqmwdmx3w,buen,afin,part,sol,ahoramadr,regulacion,via,ide,qued,manuelacarmen,tragic,peatonaliz,gan,call,sky,demadridalciel,public,dias,ayunt,sig,llov,siempr,palac,gris,posicion,tendenci,trndnl,ser,httpstcok5dfqqcsew,ocup,espany,ahor,parqu,delâ,comun,âqu,anticontaminacion,veloc,m30,restring,activ,no2,zon,cas,sab,contaminacionâ,nivel,nuev,fri,pobrez,crisis,chin,blackandwhit,blancoynegr,in,alertacontaminacionep,for,ciel,sierr,vist,convert,asi,nub,boin,torrelodon,anochec,madrid,metr,precios,ciud,polucion,verd,vam,pur,si,puert,salud,hac,diput,rio,sinfiltr,san,dia,cos,aqui,nightâ,noch,nofilt,matader,nofilters,realmadr,madridmemol,sinfiltrosâ,nsinfiltr,hotel,alnatural,nofilterâ,sunset,atardec,curr,graci,calle30,megust,amanecer,mund,templ,debod,tard,byn,lluvi,i,lleg,sinfiltroâ,espectacul,com,arquitectur,monclo,chamberi,madridmol,arroyofresn,elcielodemadr,mirasierr,bychinygb,photobychinygb,real,palaci,jardin,pase,eurotravelâ,madridsinfiltr,luz,puestadesol,otony,retir,doming,ðÿ,ðÿœ,camp,primaver,color,final,sun,bonit,amanec,sunris,retaz,atardecer,obras,deâ,buenosdi,mont,pas,navid,unviajesinequipajeâ,fern,disfrut,ven,im,lov,natural,roz,henar,alcal,you,aeropuert,mad,estadi,bernabeu,santiag,hoy,segu,trabaj,instagram,filtr,travel,beautiful,dur,pint,roj,alcorcon,despu,boadill,precaucion,m40,23052016,incdgt,blanc,tip,decrecient,kilometr,crecient,alcobend,m100,pk25_0,27042016,kilom,accident,m45,pk0_0,17062016,25052016,pk9_0,a1,07072016,pk5_0,a2,100000,carril,a6,pk24_0,pk34_0,16062016,rey,amarill,071700,sebasti,pk17_0,decreci,090000,22112016,m505,pk20_0,general,obra,084200,pk10_0,pk43_0,23112016,080000,25042016,pk12_0,pk6_0,080200,13062016,20062016,18052016,26082016,d,manten,bici,m601,reasfalt,pk20_2,navacerr,093000,pk,a42,parl,m607,kil,tres,30112016,pk16_0,cant,ki,230000,m50,29042016,kilomet,14032016,pk18_0,pk8_3,pk32_0,20042016,a4,29062016,a3,080300,06052016,pk4_5,a5,pk29_0,21112016,25112016,pk7_0,03102016,29082016,223000,16032016,183300,pozuel,l,pk12_5,alarcon,pk28_0,21062016,17112016,pk33_0,14072016,pk14_0,17052016,torrejon,pk23_0,ardoz,31032016,pk13_2,27062016,23062016,30092016,contrari,01122016,pk11_0,02122016,k,18112016,09052016,madbikestatus,bicim,dispon,rivasvaciamadr,10052016,pk15_0,pk51_0,26072016,m11,29112016,averi,pk15_5,pk27_0,pk13_0,m423,valdemor,pk8_0,m503,071300,15072016,pk22_0,ap6,24062016,28112016,000000,pk21_0,pk4_0,29092016,12052016,01072016,guadalix,08072016,04102016,073200,m512,27052016,24052016,05102016,agustin,pk30_0,calz,senyalizacion,m222,valdaracet,20052016,181900,13072016,decr,26052016,28062016,pk49_5,13052016,13122016,20072016,pk19_0,05072016,creci,pk48_0,dgtmadr,12042016,pk9_5,11052016,m513,10112016,22062016,15032016,10082016,estadioâ,halamadr,instal,cest,vem,menu,fot,hech,cam,and,was,iberi,travelgram,cambi,turiste,that,all,httpstcolvso6nplxn,corr,fiest,tourismâ,business,egaylity,igersspain,world,at,calderon,barc,vicent,almuerz,pon,mil,luch,champions,verano2016,quer,padr,foll,parej,amor,hal,architectur,prid,gay,my,vip,phot,tom,travelblogg,chiringuitomadridn4,traveling,atlet,atleti,2017,listasdepuert,discotecasmadr,albertodelacru,restaur,mayor,santiagobernabeu,executiv,nsx,bas,httpstco0j4bplq2ln,job,account,vacation,homosexual,turism,famili,ðÿªðÿ,gust,plat,fashion,backstag,week,pued,cen,her,gastronomi,futbol,6,sex,2016,pid,with,mbfwmadr,ayer,quier,w,guauguauguap,lapeludemadr,dos,championsleag,ucl,style,by,instatravel,cr7,cristian,puertadealcal,travelâ,cocin,turist,orgull,we,gent,weekend,25,fcbarcelona_,peopl,put,conciert,work,last,cruz,diseny,m,santiagoâ,restaurant,lif,import,wanderlust,amateur,deport,ðÿðÿ,mod,cristianoronald,party,ronald
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1,Unnamed: 245_level_1,Unnamed: 246_level_1,Unnamed: 247_level_1,Unnamed: 248_level_1,Unnamed: 249_level_1,Unnamed: 250_level_1,Unnamed: 251_level_1,Unnamed: 252_level_1,Unnamed: 253_level_1,Unnamed: 254_level_1,Unnamed: 255_level_1,Unnamed: 256_level_1,Unnamed: 257_level_1,Unnamed: 258_level_1,Unnamed: 259_level_1,Unnamed: 260_level_1,Unnamed: 261_level_1,Unnamed: 262_level_1,Unnamed: 263_level_1,Unnamed: 264_level_1,Unnamed: 265_level_1,Unnamed: 266_level_1,Unnamed: 267_level_1,Unnamed: 268_level_1,Unnamed: 269_level_1,Unnamed: 270_level_1,Unnamed: 271_level_1,Unnamed: 272_level_1,Unnamed: 273_level_1,Unnamed: 274_level_1,Unnamed: 275_level_1,Unnamed: 276_level_1,Unnamed: 277_level_1,Unnamed: 278_level_1,Unnamed: 279_level_1,Unnamed: 280_level_1,Unnamed: 281_level_1,Unnamed: 282_level_1,Unnamed: 283_level_1,Unnamed: 284_level_1,Unnamed: 285_level_1,Unnamed: 286_level_1,Unnamed: 287_level_1,Unnamed: 288_level_1,Unnamed: 289_level_1,Unnamed: 290_level_1,Unnamed: 291_level_1,Unnamed: 292_level_1,Unnamed: 293_level_1,Unnamed: 294_level_1,Unnamed: 295_level_1,Unnamed: 296_level_1,Unnamed: 297_level_1,Unnamed: 298_level_1,Unnamed: 299_level_1,Unnamed: 300_level_1,Unnamed: 301_level_1,Unnamed: 302_level_1,Unnamed: 303_level_1,Unnamed: 304_level_1,Unnamed: 305_level_1,Unnamed: 306_level_1,Unnamed: 307_level_1,Unnamed: 308_level_1,Unnamed: 309_level_1,Unnamed: 310_level_1,Unnamed: 311_level_1,Unnamed: 312_level_1,Unnamed: 313_level_1,Unnamed: 314_level_1,Unnamed: 315_level_1,Unnamed: 316_level_1,Unnamed: 317_level_1,Unnamed: 318_level_1,Unnamed: 319_level_1,Unnamed: 320_level_1,Unnamed: 321_level_1,Unnamed: 322_level_1,Unnamed: 323_level_1,Unnamed: 324_level_1,Unnamed: 325_level_1,Unnamed: 326_level_1,Unnamed: 327_level_1,Unnamed: 328_level_1,Unnamed: 329_level_1,Unnamed: 330_level_1,Unnamed: 331_level_1,Unnamed: 332_level_1,Unnamed: 333_level_1,Unnamed: 334_level_1,Unnamed: 335_level_1,Unnamed: 336_level_1,Unnamed: 337_level_1,Unnamed: 338_level_1,Unnamed: 339_level_1,Unnamed: 340_level_1,Unnamed: 341_level_1,Unnamed: 342_level_1,Unnamed: 343_level_1,Unnamed: 344_level_1,Unnamed: 345_level_1,Unnamed: 346_level_1,Unnamed: 347_level_1,Unnamed: 348_level_1,Unnamed: 349_level_1,Unnamed: 350_level_1,Unnamed: 351_level_1,Unnamed: 352_level_1,Unnamed: 353_level_1,Unnamed: 354_level_1,Unnamed: 355_level_1,Unnamed: 356_level_1,Unnamed: 357_level_1,Unnamed: 358_level_1,Unnamed: 359_level_1,Unnamed: 360_level_1,Unnamed: 361_level_1,Unnamed: 362_level_1,Unnamed: 363_level_1,Unnamed: 364_level_1,Unnamed: 365_level_1,Unnamed: 366_level_1,Unnamed: 367_level_1,Unnamed: 368_level_1,Unnamed: 369_level_1,Unnamed: 370_level_1,Unnamed: 371_level_1,Unnamed: 372_level_1,Unnamed: 373_level_1,Unnamed: 374_level_1,Unnamed: 375_level_1,Unnamed: 376_level_1,Unnamed: 377_level_1,Unnamed: 378_level_1,Unnamed: 379_level_1,Unnamed: 380_level_1,Unnamed: 381_level_1,Unnamed: 382_level_1,Unnamed: 383_level_1,Unnamed: 384_level_1,Unnamed: 385_level_1,Unnamed: 386_level_1,Unnamed: 387_level_1,Unnamed: 388_level_1,Unnamed: 389_level_1,Unnamed: 390_level_1,Unnamed: 391_level_1,Unnamed: 392_level_1,Unnamed: 393_level_1,Unnamed: 394_level_1,Unnamed: 395_level_1,Unnamed: 396_level_1,Unnamed: 397_level_1,Unnamed: 398_level_1,Unnamed: 399_level_1,Unnamed: 400_level_1,Unnamed: 401_level_1,Unnamed: 402_level_1,Unnamed: 403_level_1,Unnamed: 404_level_1,Unnamed: 405_level_1,Unnamed: 406_level_1,Unnamed: 407_level_1,Unnamed: 408_level_1,Unnamed: 409_level_1,Unnamed: 410_level_1,Unnamed: 411_level_1,Unnamed: 412_level_1,Unnamed: 413_level_1,Unnamed: 414_level_1,Unnamed: 415_level_1,Unnamed: 416_level_1,Unnamed: 417_level_1,Unnamed: 418_level_1,Unnamed: 419_level_1,Unnamed: 420_level_1,Unnamed: 421_level_1,Unnamed: 422_level_1,Unnamed: 423_level_1,Unnamed: 424_level_1,Unnamed: 425_level_1,Unnamed: 426_level_1,Unnamed: 427_level_1,Unnamed: 428_level_1,Unnamed: 429_level_1,Unnamed: 430_level_1,Unnamed: 431_level_1,Unnamed: 432_level_1,Unnamed: 433_level_1,Unnamed: 434_level_1,Unnamed: 435_level_1,Unnamed: 436_level_1,Unnamed: 437_level_1,Unnamed: 438_level_1,Unnamed: 439_level_1,Unnamed: 440_level_1,Unnamed: 441_level_1,Unnamed: 442_level_1,Unnamed: 443_level_1,Unnamed: 444_level_1,Unnamed: 445_level_1,Unnamed: 446_level_1,Unnamed: 447_level_1,Unnamed: 448_level_1,Unnamed: 449_level_1,Unnamed: 450_level_1,Unnamed: 451_level_1,Unnamed: 452_level_1,Unnamed: 453_level_1,Unnamed: 454_level_1,Unnamed: 455_level_1,Unnamed: 456_level_1,Unnamed: 457_level_1,Unnamed: 458_level_1,Unnamed: 459_level_1,Unnamed: 460_level_1,Unnamed: 461_level_1,Unnamed: 462_level_1,Unnamed: 463_level_1,Unnamed: 464_level_1,Unnamed: 465_level_1,Unnamed: 466_level_1,Unnamed: 467_level_1,Unnamed: 468_level_1,Unnamed: 469_level_1,Unnamed: 470_level_1,Unnamed: 471_level_1,Unnamed: 472_level_1,Unnamed: 473_level_1,Unnamed: 474_level_1,Unnamed: 475_level_1,Unnamed: 476_level_1,Unnamed: 477_level_1,Unnamed: 478_level_1,Unnamed: 479_level_1,Unnamed: 480_level_1,Unnamed: 481_level_1,Unnamed: 482_level_1,Unnamed: 483_level_1,Unnamed: 484_level_1,Unnamed: 485_level_1,Unnamed: 486_level_1,Unnamed: 487_level_1,Unnamed: 488_level_1,Unnamed: 489_level_1,Unnamed: 490_level_1,Unnamed: 491_level_1,Unnamed: 492_level_1,Unnamed: 493_level_1,Unnamed: 494_level_1,Unnamed: 495_level_1,Unnamed: 496_level_1,Unnamed: 497_level_1,Unnamed: 498_level_1,Unnamed: 499_level_1,Unnamed: 500_level_1,Unnamed: 501_level_1,Unnamed: 502_level_1,Unnamed: 503_level_1,Unnamed: 504_level_1,Unnamed: 505_level_1,Unnamed: 506_level_1,Unnamed: 507_level_1,Unnamed: 508_level_1,Unnamed: 509_level_1,Unnamed: 510_level_1,Unnamed: 511_level_1,Unnamed: 512_level_1,Unnamed: 513_level_1,Unnamed: 514_level_1,Unnamed: 515_level_1,Unnamed: 516_level_1,Unnamed: 517_level_1,Unnamed: 518_level_1,Unnamed: 519_level_1,Unnamed: 520_level_1,Unnamed: 521_level_1,Unnamed: 522_level_1,Unnamed: 523_level_1,Unnamed: 524_level_1,Unnamed: 525_level_1,Unnamed: 526_level_1,Unnamed: 527_level_1,Unnamed: 528_level_1,Unnamed: 529_level_1,Unnamed: 530_level_1,Unnamed: 531_level_1,Unnamed: 532_level_1,Unnamed: 533_level_1,Unnamed: 534_level_1,Unnamed: 535_level_1,Unnamed: 536_level_1,Unnamed: 537_level_1,Unnamed: 538_level_1,Unnamed: 539_level_1,Unnamed: 540_level_1,Unnamed: 541_level_1,Unnamed: 542_level_1,Unnamed: 543_level_1,Unnamed: 544_level_1,Unnamed: 545_level_1,Unnamed: 546_level_1,Unnamed: 547_level_1,Unnamed: 548_level_1,Unnamed: 549_level_1,Unnamed: 550_level_1,Unnamed: 551_level_1,Unnamed: 552_level_1,Unnamed: 553_level_1,Unnamed: 554_level_1,Unnamed: 555_level_1,Unnamed: 556_level_1,Unnamed: 557_level_1,Unnamed: 558_level_1,Unnamed: 559_level_1,Unnamed: 560_level_1,Unnamed: 561_level_1,Unnamed: 562_level_1,Unnamed: 563_level_1,Unnamed: 564_level_1,Unnamed: 565_level_1,Unnamed: 566_level_1,Unnamed: 567_level_1,Unnamed: 568_level_1,Unnamed: 569_level_1,Unnamed: 570_level_1,Unnamed: 571_level_1,Unnamed: 572_level_1,Unnamed: 573_level_1,Unnamed: 574_level_1,Unnamed: 575_level_1,Unnamed: 576_level_1,Unnamed: 577_level_1,Unnamed: 578_level_1,Unnamed: 579_level_1,Unnamed: 580_level_1,Unnamed: 581_level_1,Unnamed: 582_level_1,Unnamed: 583_level_1,Unnamed: 584_level_1,Unnamed: 585_level_1,Unnamed: 586_level_1,Unnamed: 587_level_1,Unnamed: 588_level_1,Unnamed: 589_level_1,Unnamed: 590_level_1,Unnamed: 591_level_1,Unnamed: 592_level_1,Unnamed: 593_level_1,Unnamed: 594_level_1,Unnamed: 595_level_1
791546620915486720,0.043926,0.003559,0.01273,0.010634,0.007295,0.003559,0.005508,0.003475,0.003559,0.010569,0.128129,0.002989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
793885278553317376,0.0,0.0,0.0,0.010634,0.0,0.0,0.0,0.0,0.0,0.010569,0.128129,0.0,0.006976,0.003559,0.005878,0.007273,0.003475,0.003559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Implementación de SVM con kernel predefinido:

In [20]:
%%time

from sklearn.svm import SVC
from sklearn import metrics
svc=SVC() #Default hyperparameters
svc.fit(train_df,train_tweets["Category"])
y_pred=svc.predict(test_df)
print('Accuracy Score: %f' %metrics.accuracy_score(test_tweets["Category"],y_pred))

Accuracy Score: 0.761261
Wall time: 2.01 s


### Implementación de SVM con kernel lineal:

In [21]:
svc=SVC(kernel='linear')
svc.fit(train_df,train_tweets["Category"])
y_pred=svc.predict(test_df)
print('Accuracy Score: %f' %metrics.accuracy_score(test_tweets["Category"],y_pred))

Accuracy Score: 0.765766


In [22]:
print(y_pred)
print(test_tweets["Category"].values)

['N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N'
 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N'
 'N' 'N' 'N' 'N' 'N' 'C' 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'N'
 'T' 'T' 'T' 'N' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'T'
 'T' 'T' 'T' 'T' 'N' 'T' 'T' 'T' 'N' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'T'
 'T' 'T' 'T' 'T' 'T' 'T' 'N' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'T'
 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'N' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'N'
 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'T' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N'
 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N'
 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N'
 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N'
 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N'
 'N' 'N' 'N' 'N' 'N' 'N']
['A' 'A' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C'
 'C' 'C' 'C' 'C' 'C' 'C' 

### Cargamos el archivo completo de tweets

In [23]:
%%time

"""Cargo los tweets en una lista, algunas líneas no son tweets completos, 
sino sólo trozos y no es posible cargarlos e incorporarlos al DataFrame, por eso try-except.
Al final con esto lastimosamente dejan de cargarse muchos tweets"""

with open("./Data/CAM20170501-002.json") as data:
    n=0
    m=0
    jr=0
    js=0
    l1=[]
    l2=[]
    l3=[]
    for line in range(1587572):
        try:
            r1=(data.readline())
        except:
            n+=1
            try:
                l1.append(r1)
            except:
                pass
        else: 
            try:
                r2=json.loads(r1)
            except:
                jr+=1
                try:
                    r2=json.loads(json.dumps(r1))
                except:
                    js+=1
                else:
                    m+=1
                    l3.append(r2)
            else:
                m+=1
                l2.append(r2)
    print("Cargados=%d\nPerdidos al leer linea= %d\nPerdidos al cargar json raw = %d\nPerdidos al cargar json string = %d"\
          %(m,n,jr,js))
    print("L1 = %d\nL2 = %d\nL3 = %d" %(len(l1),len(l2), len(l3)))

Cargados=1394502
Perdidos al leer linea= 193070
Perdidos al cargar json raw = 736206
Perdidos al cargar json string = 0
L1 = 193070
L2 = 658296
L3 = 736206
Wall time: 1min 45s


In [24]:
%%time
"""Aquí cargamos los tweets en un DataFrame"""

ll1 = []
for element in l1:
    try:
        ll1.append(json.loads(element))
    except:
        pass
tweets1 = pd.DataFrame(ll1)

Wall time: 42.6 s


In [25]:
print(tweets1.shape)
tweets1.head(2)

(192780, 54)


Unnamed: 0,Xcoord,Ycoord,_id,coordinates,created_at,created_at_localtime,display_text_range,entities,entities-hashtags,entities-media,entities-urls,entities-user_mentions,entitites,extended_entities,extended_tweet,favorite_count,favorited,filter_level,geo,id,in_reply_to_screen_name,in_reply_to_screen_nameFix,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,is_quote_status,lang,place,place-country_code,place-name,possibly_sensitive,quoted_status,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,source,sourceFix,text,textFix,timestamp_ms,timezone_info,truncated,user,user-created_at,user-followers_count,user-friends_count,user-id_str,user-lang,user-listed_count,user-location,user-screen_name,user-statuses_count
0,-3.7058,40.4203,722257977667747843,"{'type': 'Point', 'coordinates': [-3.7058, 40....",{'$date': '2016-04-19T02:58:15.000Z'},{'$date': '2016-04-19T04:58:15.000Z'},,"{'hashtags': [{'indices': [121, 128], 'text': ...","[{'indices': [121, 128], 'text': 'trndnl'}]",,"[{'url': 'https://t.co/K5DFqqcseW', 'indices':...",,,,,0.0,,,,,,,,,,,,es,"{'name': 'Madrid', 'country_code': 'ES'}",ES,Madrid,,,,,0.0,,,,"'Oklahoma' empieza a ser Tendencia, con 16652 ...",,,"{'timezone': 'Europe/Madrid', 'obtained_from':...",,"{'lang': 'es', 'statuses_count': 432224.0, 'sc...",{'$date': '2013-02-14T18:39:28.000Z'},36306.0,44.0,1179981192,es,450.0,EspaÃ±a,trendinaliaES,432224.0
1,-3.7058,40.4203,722269265693310977,"{'type': 'Point', 'coordinates': [-3.7058, 40....",{'$date': '2016-04-19T03:43:06.000Z'},{'$date': '2016-04-19T05:43:06.000Z'},,"{'hashtags': [{'indices': [0, 12], 'text': 'Lu...","[{'indices': [0, 12], 'text': 'LunesDeOjos'}, ...",,"[{'url': 'https://t.co/M8vQMwWlAW', 'indices':...",,,,,0.0,,,,,,,,,,,,es,"{'name': 'Madrid', 'country_code': 'ES'}",ES,Madrid,,,,,0.0,,,,#LunesDeOjos estuvo el lunes 18 como Tendencia...,,,"{'timezone': 'Europe/Madrid', 'obtained_from':...",,"{'lang': 'es', 'statuses_count': 432234.0, 'sc...",{'$date': '2013-02-14T18:39:28.000Z'},36306.0,44.0,1179981192,es,450.0,EspaÃ±a,trendinaliaES,432234.0


In [26]:
tweets3 = pd.DataFrame(l3)
tweets2 = pd.DataFrame(l2)

print(tweets1.shape)
print(tweets2.shape)
print(tweets3.shape)

(192780, 54)
(658296, 54)
(736206, 1)


In [27]:
del tweets3, l1, l2, l3, ll1, data

In [28]:
tw = tweets2.append(tweets1) #formamos un df único
print(tw.shape)
tw = tw.set_index("_id")
print(tw.shape)

(851076, 54)
(851076, 53)


In [29]:
del tweets2, tweets1

In [30]:
tw["texto"]=tw["text"]

### Reemplazamos carácteres extraños por letras simples
(á = Ã¡ , á = Ã\xa0 , É = Ã‰ , é = Ã© , é = Ã¨ , í = Ã\xad , í = Ã¬ , Ó = Ã“ , Ó = Ã’ , ó = Ã³ , Ú = Ãš , ú = Ãº , ü = Ã¼, Ñ = Ã‘ , ñ = Ã± , Ç = Ã‡ , ç = Ã§) 

In [31]:
%%time
tw["text"] = tw["text"].str.replace(r"\Ã\¡","a")
tw["text"] = tw["text"].str.replace(r"\Ã\xa0","a")
tw["text"] = tw["text"].str.replace(r"\Ã\©","e")
tw["text"] = tw["text"].str.replace(r"\Ã\¨","e")
tw["text"] = tw["text"].str.replace(r"\Ã\xad","i")
tw["text"] = tw["text"].str.replace(r"\Ã\¬","i")
tw["text"] = tw["text"].str.replace(r"\Ã\²","o")
tw["text"] = tw["text"].str.replace(r"\Ã\º","u")
tw["text"] = tw["text"].str.replace(r"\Ã\¼","u")
tw["text"] = tw["text"].str.replace(r"\Ã\±","ny")
tw["text"] = tw["text"].str.replace(r"\Ã\§","c")

tw["text"] = tw["text"].str.replace(r"\Ã\‰","E")
tw["text"] = tw["text"].str.replace(r"\Ã\“","O")
tw["text"] = tw["text"].str.replace(r"\Ã\’","O")
tw["text"] = tw["text"].str.replace(r"\Ã\š","U")
tw["text"] = tw["text"].str.replace(r"\Ã\‘","NY")
tw["text"] = tw["text"].str.replace(r"\Ã\‡","C")

tw["text"] = tw["text"].str.lower() # Convertimos a minúsculas

Wall time: 15.5 s


### Tokenización, limpieza y Lematización

In [32]:
%%time
# Eliminamos carácteres no alfanuméricos:
# Eliminamos StopWords:
# Lematizamos:
tw["SUMs"]=tw["text"].apply(lambda x: re.sub(r'[^\w\s]','',str(x))).str.split()\
.apply(lambda x: list(set(filter(lambda y: y not in cachedStopWords,x))))\
.apply(lambda x: [stemmer.stem(token) for token in x])

Wall time: 9min 24s


### Creación de vectores y DataFrame de características:

In [None]:
%%time

tw["feature_vector"] = tw["SUMs"].map(lambda x: feat_vect_creat(x, relevant_stems))
vectores = [vector for vector in tw["feature_vector"].values]
features_df = pd.DataFrame(vectores, columns=relevant_stems.keys(), index=tw.index)

### Etiquetado de tweets
#### Primero normalizamos los vectores

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(features_df)
features_df = scaler.transform(features_df)

In [None]:
tw["category"] =pd.DataFrame(svc.predict(features_df))

### Exportamos los tweets etiquetados a csv

In [None]:
tw.to_csv('./Data/labeled_tweets.csv', sep='¡')

In [None]:
analytic_tweets = tw[["created_at","text", "lang", "Xcoord","Ycoord","SUMs","category"]]

In [None]:
del tw

In [None]:
analytic_tweets.to_csv('./Data/analytic_tweets.csv', sep='¡')