
## DATA CLEANING

Script to clean data deleting unuseful information and processing  the data in order to make it suitable for the model



In [3]:
import pandas as pd
import re
import string
import unidecode

In [4]:
with open('text.txt') as f:
    text=f.read()

In [5]:
d = {'Texto' : pd.Series([text], index=[0])}

In [7]:
data_df = pd.DataFrame.from_dict(d).transpose()
data_df.columns = ['transcript']
data_df = data_df.sort_index()
data_df

Unnamed: 0,transcript
Texto,Sería uno de los grandes hallazgos de la egipt...


In [10]:
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?¿\]\%', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\w*\d\w*', '', text)
    text = unidecode.unidecode(text)
    return text

round1 = lambda x: clean_text_round1(x)

In [11]:
data_clean = pd.DataFrame(data_df.transcript.apply(round1))
data_clean

Unnamed: 0,transcript
Texto,seria uno de los grandes hallazgos de la egipt...


In [12]:
# Segundo round
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…«»]', '', text)
    text = re.sub('\n', ' ', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [13]:
# veamos como queda
data_clean = pd.DataFrame(data_clean.transcript.apply(round2))
data_clean

Unnamed: 0,transcript
Texto,seria uno de los grandes hallazgos de la egipt...


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

with open('español.txt') as f:
    lines = f.read().splitlines()

cv = CountVectorizer(stop_words=lines)
data_cv = cv.fit_transform(data_clean.transcript)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,abogada,aceptar,acogida,afirmacion,agonizando,agrega,aguas,ahora,alejandria,alli,...,vii,viperinas,visito,voluptuosidad,voz,william,xii,yace,yacimiento,zahi
Texto,2,1,1,1,1,1,3,3,5,3,...,2,1,1,1,2,2,1,1,1,3


In [15]:
data = data_dtm.transpose()
data.head()

Unnamed: 0,Texto
abogada,2
aceptar,1
acogida,1
afirmacion,1
agonizando,1


In [16]:
# Find the top 30 words (per Year)
top_dict = {}
for c in data.columns:
    top = data[c].sort_values(ascending=False).head(30)
    top_dict[c]= list(zip(top.index, top.values))

top_dict

{'Texto': [('cleopatra', 15),
  ('tumba', 12),
  ('antonio', 8),
  ('marco', 7),
  ('ultima', 7),
  ('hawass', 7),
  ('magna', 6),
  ('taposiris', 6),
  ('egipto', 6),
  ('dos', 6),
  ('anos', 6),
  ('martinez', 5),
  ('hace', 5),
  ('alejandria', 5),
  ('mundo', 5),
  ('nefertiti', 5),
  ('antiguo', 4),
  ('mas', 4),
  ('declaraciones', 4),
  ('diario', 3),
  ('shakespeare', 3),
  ('seria', 3),
  ('sepultura', 3),
  ('zahi', 3),
  ('grandes', 3),
  ('segun', 3),
  ('enigmas', 3),
  ('entrevista', 3),
  ('resolver', 3),
  ('excavado', 3)]}