## Importing the Libraries and the DataSets

In [1]:
import pandas as pd
import numpy as np
import unicodedata
df = pd.read_csv('\data\palavras.txt', sep="\n", header=None)
df_names = pd.read_csv('\data\dados_nome.csv', header=None)
df_countries = pd.read_csv('\data\paises.csv', sep=',', header=None, encoding="latin")
s_countries = df_countries[1]

df_names.columns = ['idx', 'Words']
df_names = df_names.drop(0)
df_names = df_names.drop(columns=['idx'])

df.columns = ['Words']

oldCount = df['Words'].count()
df.describe()


Unnamed: 0,Words
count,261797
unique,245365
top,calcareis
freq,4


## Filtering the Data

In [2]:
def filter(word):
    aspas = chr(39)
    space = chr(32)
    if len(word) < 3 or len(word) > 10: return False
    if '-' in word or aspas in word or space in word: return False
    if word.lower() != word: return False
    return True



df = df.drop_duplicates()
df.describe()


Unnamed: 0,Words
count,245365
unique,245365
top,eletroquimicos
freq,1


### Replacing mounths

In [3]:
mounths = {'Words': 
           [ 
            'janeiro', 'fevereiro', 
            'março', 'abril', 'maio', 
            'junho', 'julho', 'agosto', 
            'setembro', 'outubro',
            'novembro', 'dezembro'
           ]
           
          }

mounths_df = pd.DataFrame(mounths)
df = pd.concat([df, mounths_df])

df.tail(12)


Unnamed: 0,Words
0,janeiro
1,fevereiro
2,março
3,abril
4,maio
5,junho
6,julho
7,agosto
8,setembro
9,outubro


### Replacing countries

In [4]:
s_countries = s_countries.str.lower()
s_countries = s_countries.str.strip()
s_countries = s_countries.apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())
dict_countries = {'Words': s_countries.values}
df_countries = pd.DataFrame(dict_countries)

df = df.append(df_countries)

df.tail(10)

Unnamed: 0,Words
186,dominica
187,ilhas marshall
188,sao cristovao e nevis
189,liechtenstein
190,monaco
191,sao marino
192,palau
193,nauru
194,tuvalu
195,vaticano


### Reducing the Dataframe

In [5]:
df = df[df['Words'].map(filter)]

df = df.merge(df_names, how='outer' ,indicator=True).loc[lambda word : word['_merge']=='left_only']
df = df.drop(columns = ['_merge'])

newCount = df['Words'].count()

print(newCount)

144817


## Saving Data

In [7]:

np.savetxt('wordbank.txt', df.values, fmt='%s', delimiter='\n', encoding='utf-8')

### Final Resuls

In [6]:
porc = newCount/oldCount * 100

print('The new data represents only {:.2f}% of the old data'.format(porc))


The new data represents only 55.32% of the old data
