In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

# Notebook-wide plot styling: 10x10 figures and slightly enlarged fonts.
figure_rc = {'figure.figsize': [10, 10]}
sns.set(rc=figure_rc, font_scale=1.2)

In [2]:
# Load the pipe-delimited input file into a DataFrame and display it.
source_file = 'friends.csv'
df = pd.read_csv(source_file, sep='|')
df

Unnamed: 0,fname,lname,age_sex,section,height(cm),weight(kg),age,spend_A,spend_B,spend_C
0,Rahul,Pandey,37_M,B,175.4,85.6,37,1500.0,200.0,300
1,Rakesh,Kumar,38_M,B,,xx,38,,1000.0,
2,Nitesh,Mondol,35_F,B,165.1,?,35,500.0,-100.0,200
3,Arvind,Dwivedi,40_M,A,0,55.6,40,300.0,300.0,
4,Nikhil,Shikharwar,39_M,B,160.8,160,39,200.0,,300
5,Ashutosh,Merothiya,36_M,A,xx,-60,36,100.0,400.0,
6,Shubhro,Das,35_F,?,170.3,75.5,35,,500.0,500
7,Suvendu,Das,34_M,C,155.1,56.8,34,1000.0,100.0,xx
8,Swadesh,Kumar,36_M,C,154.2,60.6,36,100.0,200.0,300
9,Arun,Vishwakarma,38_F,xx,162.1,58.7,38,600.0,200.0,200


In [3]:
# Inspect column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   fname       10 non-null     object 
 1   lname       10 non-null     object 
 2   age_sex     10 non-null     object 
 3   section     10 non-null     object 
 4   height(cm)  9 non-null      object 
 5   weight(kg)  10 non-null     object 
 6   age         10 non-null     int64  
 7   spend_A     8 non-null      float64
 8   spend_B     9 non-null      float64
 9   spend_C     7 non-null      object 
dtypes: float64(2), int64(1), object(7)
memory usage: 928.0+ bytes


#### Work with numerical features
- Use `to_numeric` to make sure the columns contain numeric values only.
- Use `describe()` to analyze the numbers.

In [4]:
# Columns that are expected to hold numbers only.
numerical_columns = [
    'height(cm)',
    'weight(kg)',
    'age',
    'spend_A',
    'spend_B',
    'spend_C',
]

# errors='coerce' turns every entry that cannot be parsed as a number
# (e.g. the 'xx' and '?' placeholders seen in the raw data) into NaN.
for col in numerical_columns:
    numeric_values = pd.to_numeric(df[col], errors='coerce')
    df[col] = numeric_values

In [5]:
# Re-check dtypes and non-null counts after the numeric conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   fname       10 non-null     object 
 1   lname       10 non-null     object 
 2   age_sex     10 non-null     object 
 3   section     10 non-null     object 
 4   height(cm)  8 non-null      float64
 5   weight(kg)  8 non-null      float64
 6   age         10 non-null     int64  
 7   spend_A     8 non-null      float64
 8   spend_B     9 non-null      float64
 9   spend_C     6 non-null      float64
dtypes: float64(5), int64(1), object(4)
memory usage: 928.0+ bytes


In [6]:
# Display the frame after the numeric conversion.
df

Unnamed: 0,fname,lname,age_sex,section,height(cm),weight(kg),age,spend_A,spend_B,spend_C
0,Rahul,Pandey,37_M,B,175.4,85.6,37,1500.0,200.0,300.0
1,Rakesh,Kumar,38_M,B,,,38,,1000.0,
2,Nitesh,Mondol,35_F,B,165.1,,35,500.0,-100.0,200.0
3,Arvind,Dwivedi,40_M,A,0.0,55.6,40,300.0,300.0,
4,Nikhil,Shikharwar,39_M,B,160.8,160.0,39,200.0,,300.0
5,Ashutosh,Merothiya,36_M,A,,-60.0,36,100.0,400.0,
6,Shubhro,Das,35_F,?,170.3,75.5,35,,500.0,500.0
7,Suvendu,Das,34_M,C,155.1,56.8,34,1000.0,100.0,
8,Swadesh,Kumar,36_M,C,154.2,60.6,36,100.0,200.0,300.0
9,Arun,Vishwakarma,38_F,xx,162.1,58.7,38,600.0,200.0,200.0


In [7]:
# Summary statistics of the numeric columns, used to spot implausible values.
df.describe()

Unnamed: 0,height(cm),weight(kg),age,spend_A,spend_B,spend_C
count,8.0,8.0,10.0,8.0,9.0,6.0
mean,142.875,61.6,36.8,537.5,311.111111,300.0
std,58.1695,60.126985,1.932184,492.624169,310.017921,109.544512
min,0.0,-60.0,34.0,100.0,-100.0,200.0
25%,154.875,56.5,35.25,175.0,200.0,225.0
50%,161.45,59.65,36.5,400.0,200.0,300.0
75%,166.4,78.025,38.0,700.0,400.0,300.0
max,175.4,160.0,40.0,1500.0,1000.0,500.0


In [8]:
# Heights at or below this value (in cm) are treated as data-entry errors;
# the raw data contains a 0 in this column.
MIN_PLAUSIBLE_HEIGHT_CM = 90

# Vectorized replacement for the element-wise apply: entries at or below the
# threshold become NaN. Existing NaN values compare False against the
# threshold, so they are left as they are.
df['height(cm)'] = df['height(cm)'].mask(df['height(cm)'] <= MIN_PLAUSIBLE_HEIGHT_CM)

In [9]:
# Recompute the summary statistics after the height clean-up.
df.describe()

Unnamed: 0,height(cm),weight(kg),age,spend_A,spend_B,spend_C
count,7.0,8.0,10.0,8.0,9.0,6.0
mean,163.285714,61.6,36.8,537.5,311.111111,300.0
std,7.707016,60.126985,1.932184,492.624169,310.017921,109.544512
min,154.2,-60.0,34.0,100.0,-100.0,200.0
25%,157.95,56.5,35.25,175.0,200.0,225.0
50%,162.1,59.65,36.5,400.0,200.0,300.0
75%,167.7,78.025,38.0,700.0,400.0,300.0
max,175.4,160.0,40.0,1500.0,1000.0,500.0


In [10]:
# Drop the sign of negative weights using the vectorized Series.abs().
# NOTE(review): this assumes the negative entry (-60) is a sign typo rather
# than an otherwise invalid value; confirm against the data source.
df['weight(kg)'] = df['weight(kg)'].abs()
df.describe()

Unnamed: 0,height(cm),weight(kg),age,spend_A,spend_B,spend_C
count,7.0,8.0,10.0,8.0,9.0,6.0
mean,163.285714,76.6,36.8,537.5,311.111111,300.0
std,7.707016,35.300304,1.932184,492.624169,310.017921,109.544512
min,154.2,55.6,34.0,100.0,-100.0,200.0
25%,157.95,58.225,35.25,175.0,200.0,225.0
50%,162.1,60.3,36.5,400.0,200.0,300.0
75%,167.7,78.025,38.0,700.0,400.0,300.0
max,175.4,160.0,40.0,1500.0,1000.0,500.0


In [11]:
# Drop the sign of negative spend_B values using the vectorized Series.abs().
# NOTE(review): this assumes the negative entry (-100) is a sign typo rather
# than, say, a refund; confirm against the data source.
df['spend_B'] = df['spend_B'].abs()
df.describe()

Unnamed: 0,height(cm),weight(kg),age,spend_A,spend_B,spend_C
count,7.0,8.0,10.0,8.0,9.0,6.0
mean,163.285714,76.6,36.8,537.5,333.333333,300.0
std,7.707016,35.300304,1.932184,492.624169,282.842712,109.544512
min,154.2,55.6,34.0,100.0,100.0,200.0
25%,157.95,58.225,35.25,175.0,200.0,225.0
50%,162.1,60.3,36.5,400.0,200.0,300.0
75%,167.7,78.025,38.0,700.0,400.0,300.0
max,175.4,160.0,40.0,1500.0,1000.0,500.0


In [12]:
# Display the frame after the numeric clean-up steps.
df

Unnamed: 0,fname,lname,age_sex,section,height(cm),weight(kg),age,spend_A,spend_B,spend_C
0,Rahul,Pandey,37_M,B,175.4,85.6,37,1500.0,200.0,300.0
1,Rakesh,Kumar,38_M,B,,,38,,1000.0,
2,Nitesh,Mondol,35_F,B,165.1,,35,500.0,100.0,200.0
3,Arvind,Dwivedi,40_M,A,,55.6,40,300.0,300.0,
4,Nikhil,Shikharwar,39_M,B,160.8,160.0,39,200.0,,300.0
5,Ashutosh,Merothiya,36_M,A,,60.0,36,100.0,400.0,
6,Shubhro,Das,35_F,?,170.3,75.5,35,,500.0,500.0
7,Suvendu,Das,34_M,C,155.1,56.8,34,1000.0,100.0,
8,Swadesh,Kumar,36_M,C,154.2,60.6,36,100.0,200.0,300.0
9,Arun,Vishwakarma,38_F,xx,162.1,58.7,38,600.0,200.0,200.0


#### Work with categorical features
- Use `unique()` to analyze the column values.

In [13]:
# Text columns whose distinct values are inspected below.
categorical_columns = [
    'fname',
    'lname',
    'section',
]

In [14]:
# Print the distinct values of each categorical column, framed by a header
# naming the column and a fixed footer rule.
footer = '----------------------------------'
for col in categorical_columns:
    header = f'------ {col} ------'
    print(header, df[col].unique(), footer, sep='\n')

------ fname ------
['Rahul' 'Rakesh' 'Nitesh' 'Arvind' 'Nikhil' 'Ashutosh' 'Shubhro'
 'Suvendu' 'Swadesh' 'Arun']
----------------------------------
------ lname ------
['Pandey' 'Kumar' 'Mondol' 'Dwivedi' 'Shikharwar' 'Merothiya' 'Das'
 'Vishwakarma']
----------------------------------
------ section ------
['B' 'A' '?' 'C' 'xx']
----------------------------------


In [15]:
# Placeholder strings seen in the `section` column's unique values that do
# not name a real section.
missing_section_markers = ['xx', '?']

# Vectorized replacement for the element-wise apply: placeholder entries
# become NaN, every other value is kept unchanged.
df['section'] = df['section'].mask(df['section'].isin(missing_section_markers))
df

Unnamed: 0,fname,lname,age_sex,section,height(cm),weight(kg),age,spend_A,spend_B,spend_C
0,Rahul,Pandey,37_M,B,175.4,85.6,37,1500.0,200.0,300.0
1,Rakesh,Kumar,38_M,B,,,38,,1000.0,
2,Nitesh,Mondol,35_F,B,165.1,,35,500.0,100.0,200.0
3,Arvind,Dwivedi,40_M,A,,55.6,40,300.0,300.0,
4,Nikhil,Shikharwar,39_M,B,160.8,160.0,39,200.0,,300.0
5,Ashutosh,Merothiya,36_M,A,,60.0,36,100.0,400.0,
6,Shubhro,Das,35_F,,170.3,75.5,35,,500.0,500.0
7,Suvendu,Das,34_M,C,155.1,56.8,34,1000.0,100.0,
8,Swadesh,Kumar,36_M,C,154.2,60.6,36,100.0,200.0,300.0
9,Arun,Vishwakarma,38_F,,162.1,58.7,38,600.0,200.0,200.0
