In [1]:
import pandas as pd
from auxiliarFunctions import *


## Stage 1: Extract, parse and save data
In this step, we:
- Extract the user data into data frames and save each one as a csv file. The number of files to create is passed to the function as an argument (`numberOfFiles`).
- Set the number of users stored in each csv file with a second argument (`numberOfUsers`).
- Keep any csv files already in the directory: the extraction does not delete them, it only adds the requested number of new files.

In [None]:
# Extract user data and save it as csv files: 5 files with 50 users each.
# According to the Stage 1 description, csv files already in the directory are
# kept and the new ones are added, so every run of this cell grows the data set.
# NOTE(review): where the data comes from and where the files are written is
# decided inside the helper — confirm in auxiliarFunctions.
extracParseSave(numberOfUsers=50, numberOfFiles=5)

## Stage 2: Creating a general data frame
- Create a single general ("geral") data frame from the csv files created in Stage 1.


In [2]:
# Build the general data frame from the csv files created in Stage 1.
# NOTE(review): presumably the helper reads every csv file in the output
# directory — confirm in auxiliarFunctions.
geralDataFrame = creatingGeralDF()
# Bare last expression so the notebook renders the frame as a table.
geralDataFrame

Unnamed: 0.1,Unnamed: 0,title,fName,lName,city,country,email,phone,dob
0,0,Mr,Eli,Davidson,Jackson,United States,eli.davidson@example.com,(697) 664-4260,1989-07-24T05:23:17.106Z
1,1,Ms,Elia,Dumas,Nancy,France,elia.dumas@example.com,01-50-88-64-71,1982-05-25T06:10:50.600Z
2,2,Mr,Randolfo,Cavalcanti,Poá,Brazil,randolfo.cavalcanti@example.com,(48) 1655-5778,1946-02-12T15:55:57.934Z
3,3,Ms,Vlatka,Čabarkapa,Vrbas,Serbia,vlatka.cabarkapa@example.com,022-4423-823,1990-05-19T20:40:05.019Z
4,4,Monsieur,Ricardo,Fabre,Rhäzüns,Switzerland,ricardo.fabre@example.com,078 065 87 60,1955-03-06T16:31:03.765Z
...,...,...,...,...,...,...,...,...,...
45,45,Monsieur,Ludovic,Leroy,Cudrefin,Switzerland,ludovic.leroy@example.com,076 942 24 72,1966-07-24T13:05:26.442Z
46,46,Miss,Ava,Nguyen,Sunderland,United Kingdom,ava.nguyen@example.com,0113753 355 4720,1955-11-27T10:19:15.136Z
47,47,Mr,Mimon,Duarte,Araçatuba,Brazil,mimon.duarte@example.com,(55) 8995-8577,1972-09-08T04:42:06.419Z
48,48,Miss,Charlie,Miller,Port Elgin,Canada,charlie.miller@example.com,Q67 N80-7046,1999-04-01T11:10:44.700Z


## Stage 3: Formatting the data frame
Format the data frame to make it more presentable:
- Delete unwanted columns;
- Rename the column titles.


In [3]:
# Remove the 'Unnamed: 0' column that appears next to the index in the Stage 2
# output (presumably the row index that was saved into each csv — TODO confirm).
# NOTE(review): this rebinds geralDataFrame, so re-running this cell on its own
# may fail if deleteColumn raises on a missing column — verify against the helper.
geralDataFrame = deleteColumn(geralDataFrame,'Unnamed: 0')

In [4]:
# Rename the raw columns to presentation-friendly headers.
# An explicit old-name -> new-name mapping is used instead of assigning a
# positional list to `.columns`: a change in the column order can no longer
# mislabel the data silently, and re-running this cell is harmless because
# already-renamed columns are simply left untouched by `rename`.
columnLabels = {
    'title': 'Title',
    'fName': 'Name',
    'lName': 'Surname',
    'city': 'City',
    'country': 'Country',
    'email': 'Email',
    'phone': 'Phone',
    'dob': 'Birthdate',
}
geralDataFrame = geralDataFrame.rename(columns=columnLabels)
# Fail loudly when a column is missing or an unexpected one is present, as the
# positional assignment did whenever the number of columns did not match.
assert set(geralDataFrame.columns) == set(columnLabels.values()), (
    f"unexpected columns: {list(geralDataFrame.columns)}"
)

In [5]:
# Display the frame again to check the dropped column and the new headers.
geralDataFrame

Unnamed: 0,Title,Name,Surname,City,Country,Email,Phone,Birthdate
0,Mr,Eli,Davidson,Jackson,United States,eli.davidson@example.com,(697) 664-4260,1989-07-24T05:23:17.106Z
1,Ms,Elia,Dumas,Nancy,France,elia.dumas@example.com,01-50-88-64-71,1982-05-25T06:10:50.600Z
2,Mr,Randolfo,Cavalcanti,Poá,Brazil,randolfo.cavalcanti@example.com,(48) 1655-5778,1946-02-12T15:55:57.934Z
3,Ms,Vlatka,Čabarkapa,Vrbas,Serbia,vlatka.cabarkapa@example.com,022-4423-823,1990-05-19T20:40:05.019Z
4,Monsieur,Ricardo,Fabre,Rhäzüns,Switzerland,ricardo.fabre@example.com,078 065 87 60,1955-03-06T16:31:03.765Z
...,...,...,...,...,...,...,...,...
45,Monsieur,Ludovic,Leroy,Cudrefin,Switzerland,ludovic.leroy@example.com,076 942 24 72,1966-07-24T13:05:26.442Z
46,Miss,Ava,Nguyen,Sunderland,United Kingdom,ava.nguyen@example.com,0113753 355 4720,1955-11-27T10:19:15.136Z
47,Mr,Mimon,Duarte,Araçatuba,Brazil,mimon.duarte@example.com,(55) 8995-8577,1972-09-08T04:42:06.419Z
48,Miss,Charlie,Miller,Port Elgin,Canada,charlie.miller@example.com,Q67 N80-7046,1999-04-01T11:10:44.700Z


## Stage 4 - Gender

- Identify each user's gender from their title.
- Create a series with the percentage of each gender.
- Convert this series to a dictionary so the percentages can be shown in a string.


In [6]:
# Percentage share of each value found through the 'Title' column.
# NOTE(review): the result is labelled woman/man rather than by raw title, so the
# helper presumably maps titles to a gender first — confirm in auxiliarFunctions,
# including whether it modifies geralDataFrame while doing so.
percentage = getColumnPercentage(geralDataFrame, 'Title')
# Bare last expression so the notebook shows the resulting Series.
percentage

woman    52.7
man      47.3
Name: Title, dtype: float64

In [7]:
# Turn the percentage Series into a dictionary; the keys 'malePercentage' and
# 'femalePercentage' are the ones read by the summary sentence below.
genderPercentage = getGenderPercentage(percentage)
# Report both shares in one sentence, rounded to one decimal place.
print(f"We have {genderPercentage['malePercentage']:.1f}% of men and {genderPercentage['femalePercentage']:.1f}% of women in this DataFrame.")

We have 47.3% of men and 52.7% of women in this DataFrame.


## Stage 5 - Country Percentage
- Using the same function as in Stage 4 (`getColumnPercentage`), we find the percentage of users from each country in the data frame.

In [8]:
# Percentage share of each value in the 'Country' column, computed with the same
# helper that was used for 'Title'.
CountryPercentage = getColumnPercentage(geralDataFrame, 'Country')

In [9]:
# Show the per-country percentages (one row per country).
CountryPercentage

India             6.3
United Kingdom    6.0
Finland           6.0
Mexico            6.0
United States     5.7
France            5.3
Ireland           5.3
Denmark           5.3
Norway            5.0
New Zealand       5.0
Turkey            4.7
Netherlands       4.7
Australia         4.7
Canada            4.3
Switzerland       4.3
Iran              4.3
Spain             4.0
Serbia            4.0
Ukraine           3.3
Brazil            3.3
Germany           2.3
Name: Country, dtype: float64

## Stage 6 - Birthdate
- By extracting each user's gender and birth year, we generate a new data frame with the total number of births in each year, together with the percentage of each gender.


In [10]:
# Build the births report: one row per birth year with the number of births and
# the share of each gender, as described in the Stage 6 notes.
# NOTE(review): presumably the year is taken from the 'Birthdate' column and the
# gender from 'Title' — confirm in auxiliarFunctions.
birthsRelatory = countingBirths(geralDataFrame)


In [11]:
# Show the births report (indexed by year).
birthsRelatory

Unnamed: 0_level_0,Borns,manPorcetage,womanPorcetage
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1944,3.0,66.7,33.3
1945,3.0,33.3,66.7
1946,7.0,71.4,28.6
1947,6.0,66.7,33.3
1948,7.0,28.6,71.4
1949,8.0,25.0,75.0
1950,8.0,50.0,50.0
1951,3.0,33.3,66.7
1952,8.0,25.0,75.0
1953,5.0,60.0,40.0
