# Leer XML (manual)

In [2]:
import pandas as pd
import xml.etree.ElementTree as et 

# Parse the interTASS XML file and keep a handle on its root element;
# the parsing loop in the next cell iterates over the root's children.
xtree = et.parse("tweets_es_interTASS.xml")
xroot = xtree.getroot()

# Output columns, in the order they should appear in the DataFrame/CSV.
df_cols = ["tweetid", "user", "content", "date", "lang", "sentiment"]
rows = []
# Show full cell contents instead of truncating long strings.
# None is the supported "no limit" value; -1 was deprecated in pandas 1.0
# and is rejected by newer versions.
pd.set_option("display.max_colwidth", None)

In [44]:
def _child_text(node, path):
    """Return the text of the first sub-element matching ``path``, or None if it is absent."""
    found = node.find(path)
    return found.text if found is not None else None


# Path (relative to each child of the root) that feeds each output column.
# The sentiment label is nested under sentiment/polarity/value.
field_paths = {
    "tweetid": "tweetid",
    "user": "user",
    "content": "content",
    "date": "date",
    "lang": "lang",
    "sentiment": "./sentiment/polarity/value",
}

# Rebuild the list from scratch so that re-running this cell does not
# append a second copy of every row.
rows = [
    {col: _child_text(node, path) for col, path in field_paths.items()}
    for node in xroot
]

# Build the frame once, after all rows are collected (constructing it inside
# the loop is quadratic and leaves out_df undefined for an empty root).
out_df = pd.DataFrame(rows, columns=df_cols)

In [48]:
# Persist the parsed tweets; the DataFrame index is not written, the header row is.
out_df.to_csv("tweets_TAAS.csv", index=False)

# Función general

In [5]:
import pandas as pd
import xml.etree.ElementTree as et

def parse_XML(xml_file, df_cols, last_col_path="./sentiments/polarity/value"):
    """Parse an XML file into a pandas DataFrame with the given columns.

    Every direct child of the XML root produces one row. For each name in
    ``df_cols`` the value is the text of the first sub-element with that
    tag, or ``None`` when the sub-element is absent.

    The last column is special: when a sub-element with its name exists,
    the value is read from ``last_col_path`` (a path relative to the row
    node) instead of from the sub-element itself. If that nested path is
    missing, the value is ``None``.

    Parameters
    ----------
    xml_file : str, path-like or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.
    df_cols : list of str
        Sub-element tags to extract; also the column names and order of
        the returned frame.
    last_col_path : str, optional
        Path used to read the value of the last column. Defaults to
        ``"./sentiments/polarity/value"``.

    Returns
    -------
    pandas.DataFrame
        One row per child of the root, with columns ``df_cols``.
    """
    xroot = et.parse(xml_file).getroot()
    last_col = df_cols[-1] if df_cols else None

    rows = []
    for node in xroot:
        row = {}
        for col in df_cols:
            child = node.find(col)  # looked up once and reused below
            if child is None:
                row[col] = None
            elif col == last_col:
                # The label lives in a nested element; tolerate rows where
                # the container exists but the nested value does not.
                nested = node.find(last_col_path)
                row[col] = nested.text if nested is not None else None
            else:
                row[col] = child.text
        rows.append(row)

    return pd.DataFrame(rows, columns=df_cols)

In [6]:
# Sub-elements to extract from each tweet; parse_XML reads the last one
# from its nested polarity value.
df_cols = [
    "tweetid",
    "user",
    "content",
    "date",
    "lang",
    "sentiments",
]
out_df = parse_XML(xml_file="general-test-tagged-3l.xml", df_cols=df_cols)

In [7]:
# Use the singular column name "sentiment", matching the column list of the
# manual parser at the top of this notebook.
# Reassign instead of mutating in place so the cell's effect is explicit.
out_df = out_df.rename(columns={"sentiments": "sentiment"})

In [8]:
# Write the parsed tweets to CSV (no index column), read the file back and
# preview the first ten rows.
csv_path = "tweets_TASS_General_Development.csv"
out_df.to_csv(csv_path, index=False)
df = pd.read_csv(csv_path)
df.head(10)

Unnamed: 0,tweetid,user,content,date,lang,sentiment
0,142378325086715906,jesusmarana,"Portada 'Público', viernes. Fabra al banquillo...",2011-12-02T00:03:32,es,N
1,142379080808013825,EvaORegan,"Grande! RT @veronicacalderon ""El periodista es...",2011-12-02T00:06:32,es,NONE
2,142379173120442368,LosadaPescador,Gonzalo Altozano tras la presentación de su li...,2011-12-02T00:06:55,es,P
3,142379815708803072,mgilguerrero,"Mañana en Gaceta: TVE, la que pagamos tú y yo,...",2011-12-02T00:09:28,es,N
4,142381190123499520,pedroj_ramirez,Qué envidia “@mfcastineiras: Pedro mañana x la...,2011-12-02T00:14:55,es,NONE
5,142382515380961280,mgilguerrero,Más mañana en Gaceta. Amaiur depende de Uxue B...,2011-12-02T00:20:11,es,N
6,142382561501511680,SSantiagosegura,"Muy buenas noches followercetes, mañana va a s...",2011-12-02T00:20:23,es,P
7,142382722910912512,mgilguerrero,Más de mañana en Gaceta. UPyD contará casi seg...,2011-12-02T00:21:01,es,P
8,142384554206961664,mariviromero,"La felicidad no esta en los grandes anhelos , ...",2011-12-02T00:28:17,es,P
9,142386873539637248,mgilguerrero,"""Ya lo veremos, ya lo veremos..."" les ha respo...",2011-12-02T00:37:30,es,N


In [9]:
# Sanity check: (rows, columns) of the frame read back from the CSV.
print(df.shape)

(60798, 6)


# Juntar los conjuntos de entrenamiento y pruebas 

## Juntar los train set

In [6]:
# Load the general TASS CSV; it is one of the four train-set frames merged below.
tweets_general_df = pd.read_csv("tweets_TASS_General.csv")

In [7]:
# Load the interTASS train CSV with the ES suffix (presumably the Spain subset; confirm).
tweets_intertass_es_df = pd.read_csv("tweets_interTAAS_Train_ES.csv")

In [8]:
# Load the interTASS train CSV with the CR suffix (presumably the Costa Rica subset; confirm).
tweets_intertass_cr_df = pd.read_csv("tweets_interTAAS_Train_CR.csv")

In [9]:
# Load the interTASS train CSV with the PE suffix (presumably the Peru subset; confirm).
tweets_intertass_pe_df = pd.read_csv("tweets_interTAAS_Train_PE.csv")

In [10]:
# Report (rows, columns) for each train-set frame, in load order.
train_frames = (
    tweets_general_df,
    tweets_intertass_es_df,
    tweets_intertass_cr_df,
    tweets_intertass_pe_df,
)
for frame in train_frames:
    print(frame.shape)

(7219, 6)
(1008, 6)
(800, 6)
(1000, 6)


In [11]:
# Stack the four train-set frames in load order.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# produces the same result and, like append, keeps each frame's original
# index labels (so labels repeat across the source frames).
tweets_es = pd.concat(
    [
        tweets_general_df,
        tweets_intertass_es_df,
        tweets_intertass_cr_df,
        tweets_intertass_pe_df,
    ]
)
tweets_es.shape

(10027, 6)

In [12]:
# Save the merged train set; the (repeating) index is not written, the header row is.
tweets_es.to_csv("tweets_es.csv", index=False)

## Juntar los test set

In [10]:
# Load the general TASS development CSV written earlier in this notebook.
# NOTE(review): the recorded outputs report different row counts for this
# file (60798 earlier vs 39382 below) — confirm which run produced it.
tweets_general_test_df = pd.read_csv("tweets_TASS_General_Development.csv")

In [11]:
# Load the interTASS development CSV with the ES suffix.
tweets_intertass_es_test_df = pd.read_csv("tweets_interTASS_Development_ES.csv")

In [12]:
# Load the interTASS development CSV with the CR suffix.
tweets_intertass_cr_test_df = pd.read_csv("tweets_interTASS_Development_CR.csv")

In [13]:
# Load the interTASS development CSV with the PE suffix.
tweets_intertass_pe_test_df = pd.read_csv("tweets_interTASS_Development_PE.csv")

In [14]:
# Report (rows, columns) for each development-set frame, in load order.
test_frames = (
    tweets_general_test_df,
    tweets_intertass_es_test_df,
    tweets_intertass_cr_test_df,
    tweets_intertass_pe_test_df,
)
for frame in test_frames:
    print(frame.shape)

(39382, 6)
(506, 6)
(300, 6)
(500, 6)


In [16]:
# Stack the four development-set frames in load order.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# produces the same result and, like append, keeps each frame's original
# index labels (so labels repeat across the source frames).
tweets_es_test = pd.concat(
    [
        tweets_general_test_df,
        tweets_intertass_es_test_df,
        tweets_intertass_cr_test_df,
        tweets_intertass_pe_test_df,
    ]
)
tweets_es_test.shape

(40688, 6)

In [17]:
# Rename the text column to "data_lemmatized".
# NOTE(review): no lemmatization happens in the visible cells; the name is
# presumably what a downstream consumer expects — confirm.
# Reassign instead of mutating in place so the cell's effect is explicit.
tweets_es_test = tweets_es_test.rename(columns={"content": "data_lemmatized"})
tweets_es_test.columns

Index(['tweetid', 'user', 'data_lemmatized', 'date', 'lang', 'sentiment'], dtype='object')

In [18]:
# Save the merged development set; the index is not written, the header row is.
tweets_es_test.to_csv("tweets_es_development.csv", index=False)