In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load the raw survey answers; the path is relative to the notebook's working directory.
print("loading dataset..")
dataset = pd.read_csv(filepath_or_buffer='./young-people-survey/responses.csv')
# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0,Music,Slow songs or fast songs,Dance,Folk,Country,Classical music,Musical,Pop,Rock,Metal or Hardrock,...,Age,Height,Weight,Number of siblings,Gender,Left - right handed,Education,Only child,Village - town,House - block of flats
0,5.0,3.0,2.0,1.0,2.0,2.0,1.0,5.0,5.0,1.0,...,20.0,163.0,48.0,1.0,female,right handed,college/bachelor degree,no,village,block of flats
1,4.0,4.0,2.0,1.0,1.0,1.0,2.0,3.0,5.0,4.0,...,19.0,163.0,58.0,2.0,female,right handed,college/bachelor degree,no,city,block of flats
2,5.0,5.0,2.0,2.0,3.0,4.0,5.0,3.0,5.0,3.0,...,20.0,176.0,67.0,2.0,female,right handed,secondary school,no,city,block of flats
3,5.0,3.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,...,22.0,172.0,59.0,1.0,female,right handed,college/bachelor degree,yes,city,house/bungalow
4,5.0,3.0,4.0,3.0,2.0,4.0,3.0,5.0,3.0,1.0,...,20.0,170.0,59.0,1.0,female,right handed,secondary school,no,village,house/bungalow


In [3]:
# Show the distinct raw values of every string-valued column before encoding.
# Each pair is (label printed in the heading, actual column name); the two differ
# only in capitalisation for the handedness column.
label_and_column = [
    ("Smoking", "Smoking"),
    ("Alcohol", "Alcohol"),
    ("Punctuality", "Punctuality"),
    ("Lying", "Lying"),
    ("Internet usage", "Internet usage"),
    ("Gender", "Gender"),
    ("Education", "Education"),
    ("Only child", "Only child"),
    ("left - right handed", "Left - right handed"),
    ("Village - town", "Village - town"),
    ("House - block of flats", "House - block of flats"),
]
for label, column in label_and_column:
    print("values for " + label + ":")
    print(dataset[column].unique())

values for Smoking:
['never smoked' 'tried smoking' 'former smoker' 'current smoker' nan]
values for Alcohol:
['drink a lot' 'social drinker' 'never' nan]
values for Punctuality:
['i am always on time' 'i am often early' 'i am often running late' nan]
values for Lying:
['never' 'sometimes' 'only to avoid hurting someone'
 'everytime it suits me' nan]
values for Internet usage:
['few hours a day' 'most of the day' 'less than an hour a day'
 'no time at all']
values for Gender:
['female' 'male' nan]
values for Education:
['college/bachelor degree' 'secondary school' 'primary school'
 'masters degree' 'doctorate degree' 'currently a primary school pupil'
 nan]
values for Only child:
['no' 'yes' nan]
values for left - right handed:
['right handed' 'left handed' nan]
values for Village - town:
['village' 'city' nan]
values for House - block of flats:
['block of flats' 'house/bungalow' nan]


## Feature exploration

Let's check whether the string-valued features contain very rare values that could be merged together, and look at how the age values are distributed to decide whether it makes sense to encode them in a particular way.

In [4]:
# Frequency of each Smoking answer (NaN is excluded by default).
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
dataset['Smoking'].value_counts()

tried smoking     430
never smoked      208
current smoker    189
former smoker     175
Name: Smoking, dtype: int64

In [5]:
# Frequency of each Alcohol answer (NaN is excluded by default).
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
dataset['Alcohol'].value_counts()

social drinker    659
drink a lot       222
never             124
Name: Alcohol, dtype: int64

In [6]:
# Frequency of each Lying answer (NaN is excluded by default).
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
dataset['Lying'].value_counts()

sometimes                        549
only to avoid hurting someone    270
everytime it suits me            138
never                             51
Name: Lying, dtype: int64

In [7]:
# Frequency of each Punctuality answer (NaN is excluded by default).
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
dataset['Punctuality'].value_counts()

i am always on time        399
i am often early           327
i am often running late    282
Name: Punctuality, dtype: int64

In [8]:
# Frequency of each Education level (NaN is excluded by default).
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
dataset['Education'].value_counts()

secondary school                    621
college/bachelor degree             212
masters degree                       81
primary school                       80
currently a primary school pupil     10
doctorate degree                      5
Name: Education, dtype: int64

Given this, we can merge `currently a primary school pupil` with `primary school`: looking at the Age column, there cannot be respondents who are currently attending primary school, since the youngest are 15.
We cannot simply delete these 10 rows from the dataset, although we have to keep in mind that they may contain other mistakes as well.

In [9]:
# Frequency of each Internet usage answer (NaN is excluded by default).
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
dataset['Internet usage'].value_counts()

few hours a day            744
less than an hour a day    139
most of the day            124
no time at all               3
Name: Internet usage, dtype: int64

In [10]:
# Number of respondents per age (NaN is excluded by default).
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
dataset['Age'].value_counts()

19.0    210
20.0    194
21.0    127
18.0    123
22.0     84
17.0     53
23.0     47
25.0     30
16.0     29
24.0     28
28.0     17
26.0     15
27.0     14
29.0     11
15.0     11
30.0     10
Name: Age, dtype: int64

As we can see from the Age values, it is not possible for someone to currently be a primary school pupil; could this be an error?

In [11]:
# Number of respondents per reported weight, used to spot implausible values.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
dataset['Weight'].value_counts()

60.0     76
55.0     62
70.0     51
80.0     50
65.0     49
75.0     41
50.0     41
58.0     31
57.0     31
53.0     28
63.0     28
62.0     26
54.0     25
52.0     23
68.0     23
56.0     22
48.0     21
85.0     20
78.0     19
67.0     18
64.0     18
90.0     17
72.0     16
83.0     15
76.0     15
77.0     15
51.0     14
49.0     13
74.0     12
73.0     11
         ..
61.0      7
79.0      6
95.0      5
45.0      5
92.0      5
89.0      5
46.0      5
86.0      4
120.0     3
96.0      3
44.0      3
87.0      3
98.0      3
93.0      3
100.0     3
91.0      2
97.0      2
43.0      2
113.0     1
99.0      1
125.0     1
150.0     1
111.0     1
42.0      1
110.0     1
105.0     1
101.0     1
41.0      1
103.0     1
165.0     1
Name: Weight, Length: 69, dtype: int64

165.0 and 150.0 are clearly outliers; they may be heights entered as weights. I want to remove them because they may be wrong.

In [12]:
# Drop the rows whose weight is one of the two implausible values (150.0, 165.0).
# isin() is False for NaN, so rows with a missing weight are kept.
dataset = dataset[~dataset['Weight'].isin([150.0, 165.0])]

In [13]:
# Number of respondents per reported height, used to spot implausible values.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
dataset['Height'].value_counts()

170.0    86
168.0    62
175.0    56
180.0    55
165.0    54
185.0    43
178.0    43
173.0    41
172.0    38
163.0    34
167.0    29
160.0    28
183.0    24
171.0    23
164.0    23
176.0    22
182.0    22
162.0    20
169.0    19
187.0    19
184.0    19
174.0    19
177.0    18
166.0    18
190.0    17
186.0    16
189.0    15
158.0    15
188.0    13
179.0    13
161.0    10
192.0     9
181.0     9
157.0     9
193.0     8
156.0     5
159.0     5
155.0     5
195.0     4
194.0     4
153.0     3
154.0     3
200.0     2
191.0     2
197.0     2
203.0     2
148.0     1
152.0     1
62.0      1
Name: Height, dtype: int64

62.0 is clearly an outlier; it may be a weight entered as a height. I want to remove it because it may be wrong.

In [14]:
# Drop the single implausible height (62.0). Rows with a missing height are kept,
# because NaN != 62.0 evaluates to True.
# .copy() makes the filtered frame an independent object, so the column
# assignments performed on `dataset` in later cells are unambiguous and cannot
# raise a SettingWithCopyWarning.
dataset = dataset[dataset['Height'] != 62.0].copy()

In [None]:
# Progress message for the outlier-removal step.
# NOTE(review): this cell has no execution count and sits after the rows were
# already removed in the cells above.
print("deleting noisy examples..")

In [15]:
# (rows, columns) remaining after the weight/height outlier rows were dropped.
dataset.shape

(1007, 150)

In [16]:
def convertSmoking(value):
    """Encode a raw 'Smoking' answer as an ordinal integer.

    'never smoked' -> 1, 'tried smoking' -> 2, 'former smoker' -> 3,
    'current smoker' -> 4. Any other value (e.g. NaN) yields None.
    """
    codes = {
        'never smoked': 1,
        'tried smoking': 2,
        'former smoker': 3,
        'current smoker': 4,
    }
    return codes.get(value)

def convertAlcohol(value):
    """Encode a raw 'Alcohol' answer as an ordinal integer.

    'never' -> 1, 'social drinker' -> 2, 'drink a lot' -> 3.
    Any other value (e.g. NaN) yields None.
    """
    codes = {
        'never': 1,
        'social drinker': 2,
        'drink a lot': 3,
    }
    return codes.get(value)
    
def convertPunctuality(value):
    """Encode a raw 'Punctuality' answer as an integer code.

    'i am often running late' -> 1, 'i am often early' -> 2,
    'i am always on time' -> 3. Any other value (e.g. NaN) yields None.
    """
    codes = {
        'i am often running late': 1,
        'i am often early': 2,
        'i am always on time': 3,
    }
    return codes.get(value)

def convertLying(value):
    """Encode a raw 'Lying' answer as an integer code.

    'never' -> 1, 'sometimes' -> 2, 'only to avoid hurting someone' -> 3,
    'everytime it suits me' -> 4. Any other value (e.g. NaN) yields None.
    """
    codes = {
        'never': 1,
        'sometimes': 2,
        'only to avoid hurting someone': 3,
        'everytime it suits me': 4,
    }
    return codes.get(value)

def convertInternetUsage(value):
    """Encode a raw 'Internet usage' answer as an ordinal integer.

    'no time at all' -> 1, 'less than an hour a day' -> 2,
    'few hours a day' -> 3, 'most of the day' -> 4.
    Any other value (e.g. NaN) yields None.
    """
    codes = {
        'no time at all': 1,
        'less than an hour a day': 2,
        'few hours a day': 3,
        'most of the day': 4,
    }
    return codes.get(value)

def convertEducation(value):
    """Encode a raw 'Education' answer as an ordinal integer (1 = lowest level).

    Any value that is not one of the six known answers (e.g. NaN) yields None.

    NOTE(review): the notebook's prose proposes merging
    'currently a primary school pupil' into 'primary school', but the two
    answers still receive distinct codes (1 and 2) here.
    """
    codes = {
        'currently a primary school pupil': 1,
        'primary school': 2,
        'secondary school': 3,
        'college/bachelor degree': 4,
        'masters degree': 5,
        'doctorate degree': 6,
    }
    return codes.get(value)

In [17]:
# Replace each string-valued ordinal column with its integer encoding.
# Answers the encoder does not recognise (including NaN) come back as None.
# NOTE: not idempotent - re-running this cell on already-encoded columns would
# turn every value into None, because the encoders only match the raw strings.
ordinal_encoders = [
    ('Smoking', convertSmoking),
    ('Alcohol', convertAlcohol),
    ('Lying', convertLying),
    ('Punctuality', convertPunctuality),
    ('Education', convertEducation),
    ('Internet usage', convertInternetUsage),
]
for column_name, encoder in ordinal_encoders:
    dataset[column_name] = dataset[column_name].apply(encoder)

In [18]:
def GenderConversion(value):
    """Encode 'Gender' as a binary flag: 'male' -> 1, 'female' -> 0.

    Any other value (e.g. NaN) yields None.
    """
    return {'male': 1, 'female': 0}.get(value)

def OnlyChildConversion(value):
    """Encode 'Only child' as a binary flag: 'yes' -> 1, 'no' -> 0.

    Any other value (e.g. NaN) yields None.
    """
    return {'yes': 1, 'no': 0}.get(value)
    
def LRHandedConversion(value):
    """Encode handedness as a binary flag: 'left handed' -> 1, 'right handed' -> 0.

    Any other value (e.g. NaN) yields None.
    """
    return {'right handed': 0, 'left handed': 1}.get(value)

def VillageTownConversion(value):
    """Encode 'Village - town' as a binary flag: 'village' -> 1, 'city' -> 0.

    Any other value (e.g. NaN) yields None.
    """
    return {'city': 0, 'village': 1}.get(value)

def HouseBoFlatsConversion(value):
    """Encode housing type as a binary flag: 'house/bungalow' -> 1, 'block of flats' -> 0.

    Any other value (e.g. NaN) yields None.
    """
    return {'house/bungalow': 1, 'block of flats': 0}.get(value)

def Range1_5ValuesConversion(value):
    """Binarize a 1-5 rating: 4 or 5 -> 1, and 1, 2 or 3 -> 0.

    Any other value (e.g. NaN or a number outside 1-5) yields None.
    """
    if value in (4, 5):
        return 1
    if value in (1, 2, 3):
        return 0
    return None

In [19]:
# Replace the five two-valued demographic columns with 0/1 flags and collapse
# the 1-5 Empathy rating to a binary value (4-5 -> 1, 1-3 -> 0).
# Values the converters do not recognise (including NaN) come back as None.
binary_encoders = [
    ('Gender', GenderConversion),
    ('Only child', OnlyChildConversion),
    ('Left - right handed', LRHandedConversion),
    ('Village - town', VillageTownConversion),
    ('House - block of flats', HouseBoFlatsConversion),
    ('Empathy', Range1_5ValuesConversion),
]
for column_name, encoder in binary_encoders:
    dataset[column_name] = dataset[column_name].apply(encoder)

In [None]:
# Progress message for the attribute-encoding step.
# NOTE(review): this cell has no execution count and sits after the encoding
# cells above.
print("encoding attributes...")

# Missing values analysis

In [20]:
# Percentage of missing entries per column. Keep only the columns that have at
# least one missing value, largest ratio first, capped at the top 30.
missing_pct = dataset.isnull().sum() / len(dataset) * 100
df_na = missing_pct[missing_pct != 0].sort_values(ascending=False).head(30)
missing_data = pd.DataFrame({'Missing Ratio': df_na})
missing_data

Unnamed: 0,Missing Ratio
Weight,1.986097
Height,1.886792
Passive sport,1.489573
Chemistry,0.993049
Geography,0.893744
Theatre,0.794439
Latino,0.794439
Smoking,0.794439
Documentary,0.794439
Punk,0.794439


In [21]:
# Impute every missing value with the median of its own column
# (DataFrame.apply works column by column by default).
dataset = dataset.apply(lambda column: column.fillna(column.median()))

In [22]:
# Sanity check: True if any missing value is left anywhere in the frame.
dataset.isna().values.any()

False

In [None]:
# Progress message for the missing-value imputation step.
# NOTE(review): this cell has no execution count and sits after the imputation
# cell above.
print("imputing missing values...")

In [None]:
# Progress message announcing the export of the preprocessed frame.
# NOTE(review): this cell has no execution count.
print("saving the new preprocessed dataset as 'preprocessedDataset2'")

In [23]:
# Write the preprocessed frame to the working directory. The row index is
# written as the first (unnamed) column, which is the to_csv default; because
# rows were filtered out earlier, that index is no longer contiguous.
dataset.to_csv("preprocessedDataset2.csv", index=True)