# Leer XML (manual)

In [2]:
import pandas as pd
import xml.etree.ElementTree as et 

# Parse the XML file and keep its root element; the root's children are
# iterated over when the rows are built.
xtree = et.parse("tweets_es_interTASS.xml")
xroot = xtree.getroot()

# Column order of the resulting DataFrame / CSV.
df_cols = ["tweetid", "user", "content", "date", "lang", "sentiment"]
rows = []
# Show full cell contents instead of truncating long text. None is the
# supported "no limit" value; the old -1 sentinel was deprecated in pandas 1.0
# and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

In [44]:
# Column name -> path (relative to each child of the root) of the element
# whose text fills that column. The sentiment label is nested under
# <sentiment><polarity><value>.
xpaths = {
    "tweetid": "tweetid",
    "user": "user",
    "content": "content",
    "date": "date",
    "lang": "lang",
    "sentiment": "./sentiment/polarity/value",
}

# Start from an empty list so that re-running this cell does not append the
# same rows a second time.
rows = []
for node in xroot:
    row = {}
    for col, xpath in xpaths.items():
        found = node.find(xpath)
        # A missing element yields None instead of raising AttributeError.
        row[col] = found.text if found is not None else None
    rows.append(row)

# Build the frame once, after the loop, rather than on every iteration; this
# also defines out_df when the root has no children.
out_df = pd.DataFrame(rows, columns=df_cols)

In [48]:
# Write the parsed tweets to CSV with a header row and without the index column.
out_df.to_csv("tweets_TAAS.csv", index=False, header=True)

# Función general

In [50]:
import pandas as pd
import xml.etree.ElementTree as et

def parse_XML(xml_file, df_cols, polarity_path="./sentiments/polarity/value"):
    """Parse an XML file into a pandas DataFrame with the given columns.

    Every direct child of the XML root becomes one row. For each name in
    ``df_cols`` the text of the child's sub-element with that tag is stored.
    The last column is special: when a sub-element with that tag exists, its
    value is read from the nested element at ``polarity_path`` instead.
    Anything that cannot be found is stored as ``None``.

    Parameters
    ----------
    xml_file : str or file object
        Path or open file handed to ``xml.etree.ElementTree.parse``.
    df_cols : sequence of str
        Sub-element tags to extract; also the column order of the result.
    polarity_path : str, optional
        Path, relative to each row element, of the element whose text fills
        the last column. Defaults to ``"./sentiments/polarity/value"``.

    Returns
    -------
    pandas.DataFrame
        One row per child of the root, with columns ``df_cols``.
    """
    xroot = et.parse(xml_file).getroot()
    last_col = df_cols[-1] if len(df_cols) else None

    rows = []
    for node in xroot:
        row = {}
        for col in df_cols:
            child = node.find(col)
            if child is None:
                row[col] = None
            elif col == last_col:
                # The container element exists, but the nested value may
                # still be absent; fall back to None rather than failing on
                # `.text` of a missing element.
                nested = node.find(polarity_path)
                row[col] = nested.text if nested is not None else None
            else:
                row[col] = child.text
        rows.append(row)

    return pd.DataFrame(rows, columns=df_cols)

In [52]:
# Tags to extract from each row element. The last entry ("sentiments") is the
# column that parse_XML fills from the nested polarity value.
df_cols = ["tweetid", "user", "content", "date", "lang", "sentiments"]
out_df = parse_XML("tweets_es_TASS2012.xml", df_cols)

In [53]:
# Rename "sentiments" to "sentiment" so the column name matches the manually
# parsed frame. Reassigning the result instead of using inplace=True avoids
# mutating a frame that other cells may have already displayed.
out_df = out_df.rename(columns={"sentiments": "sentiment"})
out_df.columns

Index(['tweetid', 'user', 'content', 'date', 'lang', 'sentiment'], dtype='object')

In [54]:
# Save the parsed tweets without the index column, then read the file back
# and preview the first ten rows.
out_df.to_csv("tweets_TASS_General.csv", index=False, header=True)
df = pd.read_csv("tweets_TASS_General.csv")
df.head(10)

Unnamed: 0,tweetid,user,content,date,lang,sentiment
0,142389495503925248,ccifuentes,"Salgo de #VeoTV , que día más largoooooo...",2011-12-02T00:47:55,es,NONE
1,142389933619945473,CarmendelRiego,@PauladeLasHeras No te libraras de ayudar me/nos. Besos y gracias,2011-12-02T00:49:40,es,NEU
2,142391947707940864,CarmendelRiego,@marodriguezb Gracias MAR,2011-12-02T00:57:40,es,P
3,142416095012339712,mgilguerrero,"Off pensando en el regalito Sinde, la que se va de la SGAE cuando se van sus corruptos. Intento no sacar conclusiones (lo intento)",2011-12-02T02:33:37,es,N
4,142422495721562112,paurubio,Conozco a alguien q es adicto al drama! Ja ja ja te suena d algo!,2011-12-02T02:59:03,es,P
5,142424715175280640,paurubio,"RT @FabHddzC: Si amas a alguien, déjalo libre. Si grita ese hombre es mío era @paurubio...",2011-12-02T03:07:52,es,NONE
6,142483342040907776,Carlos_Latre,Toca @crackoviadeTV3 . Grabación dl especial Navideño...Mari crismas!,2011-12-02T07:00:50,es,P
7,142493511634259968,Ignacos,Hoy asisitiré en Madrid a un seminario sobre la Estrategia Española de Seguridad organizado por FAES.,2011-12-02T07:41:15,es,NONE
8,142494476051562496,nacho_uriarte,Buen día todos! Lo primero mandar un abrazo grande a Miguel y a su familia @libertadmontes Hoy podría ser un día para la grandeza humana.,2011-12-02T07:45:05,es,P
9,142496796416016384,JuanraLucas,Desde el escaño. Todo listo para empezar #endiascomohoy en el Congreso http://t.co/Mu2yIgCb,2011-12-02T07:54:19,es,P


In [26]:
# Print (rows, columns) of the frame read back from the CSV.
# NOTE(review): the recorded output (506, 6) disagrees with the (7219, 6)
# reported further down for the same CSV file, and this cell's execution count
# is out of order — the output is probably stale; re-run to confirm.
print(df.shape)

(506, 6)


# Juntar los conjuntos de entrenamiento y pruebas 

## Juntar los train set

In [55]:
# Load the CSV written earlier from the parse_XML result.
tweets_general_df = pd.read_csv("tweets_TASS_General.csv")

In [56]:
# Load the "ES" training CSV (presumably produced by the same XML-to-CSV
# conversion — verify; this notebook does not show where the file is written).
tweets_intertass_es_df = pd.read_csv("tweets_interTAAS_Train_ES.csv")

In [57]:
# Load the "CR" training CSV (presumably produced by the same XML-to-CSV
# conversion — verify; this notebook does not show where the file is written).
tweets_intertass_cr_df = pd.read_csv("tweets_interTAAS_Train_CR.csv")

In [58]:
# Load the "PE" training CSV (presumably produced by the same XML-to-CSV
# conversion — verify; this notebook does not show where the file is written).
tweets_intertass_pe_df = pd.read_csv("tweets_interTAAS_Train_PE.csv")

In [59]:
# Report (rows, columns) of each training frame, one per line, in load order.
for frame in (
    tweets_general_df,
    tweets_intertass_es_df,
    tweets_intertass_cr_df,
    tweets_intertass_pe_df,
):
    print(frame.shape)

(7219, 6)
(1008, 6)
(800, 6)
(1000, 6)


In [60]:
# Stack the four training frames row-wise into a single frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; one
# pd.concat call gives the same result (original row labels are kept) and
# avoids creating an intermediate copy for every chained append.
tweets_es = pd.concat(
    [
        tweets_general_df,
        tweets_intertass_es_df,
        tweets_intertass_cr_df,
        tweets_intertass_pe_df,
    ]
)
tweets_es.shape

(10027, 6)

In [61]:
# Write the combined training set to CSV with a header row and no index column.
tweets_es.to_csv("tweets_es.csv", index=False, header=True)