# Leer XML (manual)

In [2]:
import pandas as pd
import xml.etree.ElementTree as et 

# Parse the tweet corpus once and keep its root element; the children of the
# root are iterated as individual tweet nodes by the parsing loop.
xtree = et.parse("tweets_es_interTASS.xml")
xroot = xtree.getroot()

# Output column order for the DataFrame built from the parsed tweets.
df_cols = ["tweetid", "user", "content", "date", "lang", "sentiment"]
rows = []
# Show full cell contents instead of truncating long tweet text. `None` is the
# documented "no limit" value; the old -1 sentinel is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

In [44]:
# Map every output column to the path (relative to a tweet node) whose text
# fills it. The sentiment label is nested, so it needs a deeper path than the
# flat fields.
field_paths = {
    "tweetid": "tweetid",
    "user": "user",
    "content": "content",
    "date": "date",
    "lang": "lang",
    "sentiment": "./sentiment/polarity/value",
}

# Start from an empty list so that re-running this cell does not append the
# same tweets a second time.
rows = []
for node in xroot:
    row = {}
    for col, path in field_paths.items():
        found = node.find(path)
        # Compare against None explicitly: a childless Element is falsy.
        # A missing element yields None instead of raising AttributeError.
        row[col] = found.text if found is not None else None
    rows.append(row)

# Build the frame once, after the loop; rebuilding it for every tweet made the
# cell quadratic in the number of tweets.
out_df = pd.DataFrame(rows, columns=df_cols)

In [48]:
# Write the parsed tweets without the row index; `index` is documented as a
# bool, so pass False explicitly rather than relying on None being falsy.
out_df.to_csv("tweets_TAAS.csv", index=False, header=True)

# Función general

In [17]:
import pandas as pd
import xml.etree.ElementTree as et

def parse_XML(xml_file, df_cols, last_col_path="./sentiment/polarity/value"):
    """Parse an XML file into a DataFrame with one row per child of the root.

    Every column in ``df_cols`` except the last is filled from the text of the
    direct child element whose tag equals the column name. The last column is
    filled from the text of the element found at ``last_col_path``, provided
    the node also has a direct child tagged with that column name.

    Args:
        xml_file: File name or file object accepted by ``ElementTree.parse``.
        df_cols: Ordered column names of the resulting DataFrame.
        last_col_path: Path, relative to each node, of the element whose text
            fills the last column. Defaults to the nested polarity value.

    Returns:
        A DataFrame with columns ``df_cols``. A cell is ``None`` when the
        element is missing or has no text.
    """
    xroot = et.parse(xml_file).getroot()
    # Guard the lookup so an empty column list does not raise IndexError.
    last_col = df_cols[-1] if df_cols else None
    rows = []

    for node in xroot:
        row = {}
        for el in df_cols:
            # Look each element up once; compare against None explicitly
            # because a childless Element is falsy.
            child = node.find(el)
            if child is not None and el == last_col:
                # The label lives in a nested element. If that element is
                # absent the cell becomes None instead of raising
                # AttributeError on `.text`.
                child = node.find(last_col_path)
            row[el] = child.text if child is not None else None
        rows.append(row)

    return pd.DataFrame(rows, columns=df_cols)

In [18]:
# Columns to extract, in output order; parse_XML fills the last one from the
# nested polarity value rather than from a direct child's text.
df_cols = [
    "tweetid",
    "user",
    "content",
    "date",
    "lang",
    "sentiment",
]
out_df = parse_XML(xml_file="intertass-PE-test.xml", df_cols=df_cols)

In [19]:
#out_df.rename(columns = {"sentiments": "sentiment"}, inplace =True)
#out_df.columns

In [20]:
# Persist the parsed tweets without the row index (`index` is documented as a
# bool, so pass False explicitly), then reload the file to preview what was
# actually written to disk.
out_df.to_csv("tweets_interTAAS_Test_PE.csv", index=False, header=True)
df = pd.read_csv("tweets_interTAAS_Test_PE.csv")
df.head(10)

Unnamed: 0,tweetid,user,content,date,lang,sentiment
0,769690830114357248,744639307906953216,@MundonickLA @mgabrieladfc siempre hermosa mar...,Sun Aug 28 00:19:28 +0000 2016,es,
1,771077379531821057,713834336,"El sábado me dijeron ""yo te he visto antes, pe...",Wed Aug 31 20:09:07 +0000 2016,es,
2,772489016352669701,53311422,Sabes que no tendrás un buen día cuando lo pri...,Sun Sep 04 17:38:28 +0000 2016,es,
3,771317218634149888,599653674,"En situaciones en las que no sepas que hacer, ...",Thu Sep 01 12:02:09 +0000 2016,es,
4,771316436107014144,599653674,El Universo es infinito y como tal quiere que ...,Thu Sep 01 11:59:03 +0000 2016,es,
5,771720050017460225,1489810838,Cusco again Días felices #AmoCusco #Urubamba #...,Fri Sep 02 14:42:52 +0000 2016,es,
6,771519053211004929,783350538,En el examen de geometría me estoy esforzando ...,Fri Sep 02 01:24:11 +0000 2016,es,
7,772626202238578688,91282238,Los putos polos esos que se cruzan en el pecho...,Mon Sep 05 02:43:35 +0000 2016,es,
8,772619014820560896,1694172402,@Trovack @iEnterate vamos por buen camino,Mon Sep 05 02:15:02 +0000 2016,es,
9,772993402791202816,2393444238,#HaceTiempoYoNo Tengo un novio formal,Tue Sep 06 03:02:43 +0000 2016,es,


In [5]:
# Print the (rows, columns) shape of `df` as a quick size check.
print(df.shape)

(7219, 6)


# Juntar los conjuntos de entrenamiento y pruebas 

## Juntar los train set

In [6]:
# Load the general TASS tweets, the first of the training frames merged below.
tweets_general_df = pd.read_csv("tweets_TASS_General.csv")

In [7]:
# Load the InterTASS "ES" training tweets from their CSV file.
tweets_intertass_es_df = pd.read_csv("tweets_interTAAS_Train_ES.csv")

In [8]:
# Load the InterTASS "CR" training tweets from their CSV file.
tweets_intertass_cr_df = pd.read_csv("tweets_interTAAS_Train_CR.csv")

In [9]:
# Load the InterTASS "PE" training tweets from their CSV file.
tweets_intertass_pe_df = pd.read_csv("tweets_interTAAS_Train_PE.csv")

In [10]:
# Report the (rows, columns) shape of every training frame, in load order.
for train_frame in (
    tweets_general_df,
    tweets_intertass_es_df,
    tweets_intertass_cr_df,
    tweets_intertass_pe_df,
):
    print(train_frame.shape)

(7219, 6)
(1008, 6)
(800, 6)
(1000, 6)


In [11]:
# Stack the four training frames into one. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat does the same stacking in a single
# call instead of copying the data once per chained append. Row index labels
# are kept as they are in each input frame, as before.
tweets_es = pd.concat(
    [
        tweets_general_df,
        tweets_intertass_es_df,
        tweets_intertass_cr_df,
        tweets_intertass_pe_df,
    ]
)
tweets_es.shape

(10027, 6)

In [12]:
# Write the merged training set without the row index; `index` is documented
# as a bool, so pass False explicitly rather than relying on None being falsy.
tweets_es.to_csv("tweets_es.csv", index=False, header=True)

## Juntar los test set

In [22]:
# Load the InterTASS "ES" test tweets from their CSV file.
# NOTE(review): this file name spells "interTASS" while the sibling CSV files
# use "interTAAS" — confirm it matches the file actually on disk.
tweets_intertass_es_test_df = pd.read_csv("tweets_interTASS_Test_ES.csv")

In [23]:
# Load the InterTASS "CR" test tweets from their CSV file.
tweets_intertass_cr_test_df = pd.read_csv("tweets_interTAAS_Test_CR.csv")

In [24]:
# Load the InterTASS "PE" test tweets, the file written by the earlier
# parse_XML cell under this same name.
tweets_intertass_pe_test_df = pd.read_csv("tweets_interTAAS_Test_PE.csv")

In [25]:
#print(tweets_general_df.shape)
# Report the (rows, columns) shape of every test frame, in load order.
for test_frame in (
    tweets_intertass_es_test_df,
    tweets_intertass_cr_test_df,
    tweets_intertass_pe_test_df,
):
    print(test_frame.shape)

(1899, 6)
(1233, 6)
(1428, 6)


In [26]:
# Stack the three test frames into one. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat does the same stacking in a single
# call. Row index labels are kept as they are in each input frame, as before.
tweets_es_test = pd.concat(
    [
        tweets_intertass_es_test_df,
        tweets_intertass_cr_test_df,
        tweets_intertass_pe_test_df,
    ]
)
tweets_es_test.shape

(4560, 6)

In [28]:
# Rename the tweet-text column by rebinding the result instead of mutating the
# frame in place, so the cell reads as a plain transformation.
# NOTE(review): nothing visible in this notebook lemmatizes the text; the name
# "data_lemmatized" presumably matches a schema expected downstream — confirm.
tweets_es_test = tweets_es_test.rename(columns={"content": "data_lemmatized"})
tweets_es_test.columns

Index(['tweetid', 'user', 'data_lemmatized', 'date', 'lang', 'sentiment'], dtype='object')

In [29]:
# Write the merged test set without the row index; `index` is documented as a
# bool, so pass False explicitly rather than relying on None being falsy.
tweets_es_test.to_csv("tweets_es_test.csv", index=False, header=True)