In [1]:
import numpy as np
import pandas as pd
from faker import Faker
# Seed Faker's shared random generator so the synthetic names generated later
# in the notebook are identical on every "Restart Kernel -> Run All".
Faker.seed(0)
fake = Faker()

<b>1. Read the data from a CSV file.</b>

In [2]:
# Load the survey data from the CSV file located next to the notebook.
csv_path = 'igrzyska_list_7.csv'
df = pd.read_csv(csv_path)

# Display one column as a quick check that the file was parsed.
df.loc[:, 'economical_opinion']

0      4.000000
1      3.333333
2      2.333333
3      4.000000
4      3.000000
         ...   
955    3.666667
956    2.666667
957    2.666667
958    2.666667
959    3.333333
Name: economical_opinion, Length: 960, dtype: float64

<b>2. Normalize values in columns q67s1, q67s2, q67s3 and q67s4.</b>

In [3]:
# Columns that are rescaled to the [0, 1] range.
Q67_COLUMNS = ["q67s1", "q67s2", "q67s3", "q67s4"]


def min_max_normalize(column):
    """Rescale a numeric Series to the [0, 1] range (min-max normalization).

    Parameters
    ----------
    column : pd.Series
        Numeric values to rescale.

    Returns
    -------
    pd.Series
        ``(column - min) / (max - min)``. If every non-missing value is the
        same (max == min), zeros are returned instead of the NaNs that a 0/0
        division would produce. Missing values stay missing.
    """
    lowest = column.min()
    value_range = column.max() - lowest
    if value_range == 0:
        # Constant column: avoid dividing by zero; keep a float dtype so the
        # result type matches the regular branch.
        return (column - lowest).astype(float)
    return (column - lowest) / value_range


# One helper call per column replaces four copy-pasted formulas.
for column_name in Q67_COLUMNS:
    df[column_name] = min_max_normalize(df[column_name])

df[Q67_COLUMNS]

Unnamed: 0,q67s1,q67s2,q67s3,q67s4
0,0.428571,0.428571,0.428571,0.428571
1,0.000000,0.000000,0.000000,0.142857
2,1.000000,1.000000,1.000000,1.000000
3,0.000000,0.000000,0.000000,0.000000
4,0.142857,0.142857,0.142857,0.142857
...,...,...,...,...
955,0.857143,0.857143,0.857143,0.857143
956,0.142857,0.142857,0.142857,0.142857
957,0.000000,0.000000,0.285714,0.428571
958,0.000000,0.000000,0.000000,0.000000


<b>3. Replace NaNs in column q13 with the most frequent value (mode).</b>

In [4]:
# Count the missing answers before imputation.
missing_before = df['q13'].isna().sum()
print(f"\nNumber of NaN values: \t {missing_before}")

# .mode() can return several tied values; the first one is used as the fill.
most_frequent = df['q13'].mode().values[0]
df['q13'] = df['q13'].fillna(most_frequent)

# Report the mode and confirm that no missing values remain.
mode_after = df['q13'].mode().values[0]
missing_after = df['q13'].isna().sum()
print(f"q13 column mode: \t {mode_after}",
      f"\nNumber of NaN values: \t {missing_after}")

df['q13']


Number of NaN values: 	 511
q13 column mode: 	 6.0 
Number of NaN values: 	 0


0       8.0
1       3.0
2       3.0
3       6.0
4      11.0
       ... 
955    11.0
956     6.0
957     6.0
958     6.0
959     6.0
Name: q13, Length: 960, dtype: float64

<b>4. Create a leap_year column based on the respondent's birth year.</b>

In [5]:
# Parse the birth year as a timestamp; with format='%Y' each value becomes
# January 1st of that year.
birth_year_timestamps = pd.to_datetime(df['URODZONY'], format='%Y')
df['year_to_datetime'] = birth_year_timestamps

# The .dt accessor flags the timestamps that fall in a leap year.
df['leap_year'] = birth_year_timestamps.dt.is_leap_year

df[['URODZONY', 'year_to_datetime', 'leap_year']]

Unnamed: 0,URODZONY,year_to_datetime,leap_year
0,1940,1940-01-01,True
1,1943,1943-01-01,False
2,1936,1936-01-01,True
3,1948,1948-01-01,True
4,1994,1994-01-01,False
...,...,...,...
955,1991,1991-01-01,False
956,1964,1964-01-01,True
957,1975,1975-01-01,False
958,1984,1984-01-01,True


<b>5. Examples of Series accessor usage (.dt, .str, .cat, .sparse).</b>

In [6]:
# Generate one synthetic full name per row and attach it as a new column.
row_count = len(df['woj'])
full_name_series = pd.Series(data=[fake.name() for _ in range(row_count)])
df['full_name'] = full_name_series

# One example per Series accessor: .dt, .str, .cat and .sparse.
day_of_week = df['year_to_datetime'].dt.dayofweek
name_ends_with_e = df['full_name'].str.endswith('e')
woj_categories = df['woj'].astype('category').cat.categories
# .sparse.density is a fraction between 0 and 1, not a percentage.
q32_density = df['q32'].astype(pd.SparseDtype(object)).sparse.density

# print() joins the parts with a single space, so each header line ends
# with a trailing space before the newline that starts the next part.
report_parts = [
    '=== Day of week ===',
    f"\n{day_of_week}",
    "\n\n=== Check if the full_name column ends with 'e' ===",
    f"\n{name_ends_with_e}",
    '\n\n=== Get all categories ===',
    f"\n{woj_categories}",
    "\n\n=== Check % of non-empty values in 'q32' column ===",
    f"\n{q32_density}",
]
print(*report_parts)



=== Day of week === 
0      0
1      4
2      2
3      3
4      5
      ..
955    1
956    2
957    2
958    6
959    5
Name: year_to_datetime, Length: 960, dtype: int64 

=== Check if the full_name column ends with 'e' === 
0       True
1      False
2       True
3      False
4      False
       ...  
955    False
956    False
957    False
958    False
959    False
Name: full_name, Length: 960, dtype: bool 

=== Get all categories === 
Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], dtype='int64') 

=== Check % of non-empty values in 'q32' column === 
0.004166666666666667


<b>6. Create 5 equal-width birth-year groups and a new column assigning each respondent to one of these groups.</b>

In [7]:
# Earliest and latest birth years present in the data.
earliest_year = np.min(df['URODZONY'])
latest_year = np.max(df['URODZONY'])
print(f"Min value: \t{earliest_year}",
      f"\nMax value: \t{latest_year}")

min_max_difference = latest_year - earliest_year
group_step = min_max_difference // 5

# Five consecutive ranges, each spanning group_step + 1 years. The "+ k"
# offset starts every range one year after the previous range ends.
group_headers = [
    "\nFirst group (very old): \t",
    "\nSecond group (old): \t\t",
    "\nThird group (middle-aged): \t",
    "\nFourth group (young): \t\t",
    "\nFifth group (very young): \t",
]
group_lines = [
    f"{header}{earliest_year + k * group_step + k} - {earliest_year + (k + 1) * group_step + k}"
    for k, header in enumerate(group_headers)
]
print(*group_lines)

# Inclusive upper birth-year bound of each age group -> group label.
# Defined once at module level so it is not rebuilt on every call.
_AGE_GROUP_UPPER_BOUNDS = {
    1938: 'very old',
    1952: 'old',
    1966: 'middle-aged',
    1980: 'young',
    1994: 'very young',
}


def specify_age_group(birth_year, age_groups=None):
    """Return the age-group label for a birth year.

    Parameters
    ----------
    birth_year : int
        Respondent's year of birth.
    age_groups : dict, optional
        Mapping of inclusive upper birth-year bound -> label. Defaults to
        the five groups in ``_AGE_GROUP_UPPER_BOUNDS``.

    Returns
    -------
    str or None
        Label of the group with the smallest upper bound that is
        >= ``birth_year``; ``None`` when ``birth_year`` is later than every
        bound (or is NaN, since all comparisons with NaN are False).
    """
    bounds = _AGE_GROUP_UPPER_BOUNDS if age_groups is None else age_groups

    # Walk the bounds in ascending order so the result does not depend on the
    # insertion order of the mapping.
    for upper_bound in sorted(bounds):
        if birth_year <= upper_bound:
            return bounds[upper_bound]

    # No group covers this year.
    return None
      
# Label every respondent with the age group that matches their birth year.
birth_years = df['URODZONY']
df['age_group'] = birth_years.apply(specify_age_group)

# Preview the first ten assignments.
df.loc[:, ['URODZONY', 'age_group']].head(10)

Min value: 	1925 
Max value: 	1994

First group (very old): 	1925 - 1938 
Second group (old): 		1939 - 1952 
Third group (middle-aged): 	1953 - 1966 
Fourth group (young): 		1967 - 1980 
Fifth group (very young): 	1981 - 1994


Unnamed: 0,URODZONY,age_group
0,1940,old
1,1943,old
2,1936,very old
3,1948,old
4,1994,very young
5,1928,very old
6,1979,young
7,1936,very old
8,1972,young
9,1976,young
