Linear Regression

Mean, Median, Mode, Normal Distribution, Standard Deviation, Inter Quartile Range - Box Plot

# Import Packages

In [1]:
import pandas as pd
import numpy as np

# Import Data

In [2]:
df = pd.read_csv('bike_buyers.csv')

# Check Data

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                1000 non-null   int64  
 1   Marital Status    993 non-null    object 
 2   Gender            989 non-null    object 
 3   Income            994 non-null    float64
 4   Children          992 non-null    float64
 5   Education         1000 non-null   object 
 6   Occupation        1000 non-null   object 
 7   Home Owner        996 non-null    object 
 8   Cars              991 non-null    float64
 9   Commute Distance  1000 non-null   object 
 10  Region            1000 non-null   object 
 11  Age               992 non-null    float64
 12  Purchased Bike    1000 non-null   object 
dtypes: float64(4), int64(1), object(8)
memory usage: 101.7+ KB


# 1/ Missing Values

## Are there missing values?

In [4]:
df.isnull().sum()

ID                   0
Marital Status       7
Gender              11
Income               6
Children             8
Education            0
Occupation           0
Home Owner           4
Cars                 9
Commute Distance     0
Region               0
Age                  8
Purchased Bike       0
dtype: int64

## Treat Numerical Missing Values

In [5]:
# Calculate the mean of the "Income" column
mean_value = df['Income'].mean()

# Fill the missing values in the "Income" column with the calculated mean
df['Income'].fillna(mean_value, inplace=True)

mean_value

56267.605633802814

In [6]:
df.isnull().sum()

ID                   0
Marital Status       7
Gender              11
Income               0
Children             8
Education            0
Occupation           0
Home Owner           4
Cars                 9
Commute Distance     0
Region               0
Age                  8
Purchased Bike       0
dtype: int64

In [7]:
# Calculate the mean of the "Income" column
mean_value = df['Age'].mean()

# Fill the missing values in the "Income" column with the calculated mean
df['Age'].fillna(mean_value, inplace=True)

mean_value

44.181451612903224

## Treat Categorical Missing Values 

In [8]:
df['Marital Status'].fillna( 'Missing' , inplace = True)
df['Gender'].fillna( 'Missing' , inplace = True)
df['Home Owner'].fillna( 'Missing' , inplace = True)
df['Cars'].fillna( 'Missing' , inplace = True)

### Check for missing values

In [9]:
df.isnull().sum()

ID                  0
Marital Status      0
Gender              0
Income              0
Children            8
Education           0
Occupation          0
Home Owner          0
Cars                0
Commute Distance    0
Region              0
Age                 0
Purchased Bike      0
dtype: int64

# 2/ Feature Transformation
- Numerical - Transformation
- Categorical - Encoding

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                1000 non-null   int64  
 1   Marital Status    1000 non-null   object 
 2   Gender            1000 non-null   object 
 3   Income            1000 non-null   float64
 4   Children          992 non-null    float64
 5   Education         1000 non-null   object 
 6   Occupation        1000 non-null   object 
 7   Home Owner        1000 non-null   object 
 8   Cars              1000 non-null   object 
 9   Commute Distance  1000 non-null   object 
 10  Region            1000 non-null   object 
 11  Age               1000 non-null   float64
 12  Purchased Bike    1000 non-null   object 
dtypes: float64(3), int64(1), object(9)
memory usage: 101.7+ KB


In [11]:
df.describe()

Unnamed: 0,ID,Income,Children,Age
count,1000.0,1000.0,992.0,1000.0
mean,19965.992,56267.605634,1.910282,44.181452
std,5347.333948,30974.380206,1.62691,11.316422
min,11000.0,10000.0,0.0,25.0
25%,15290.75,30000.0,0.0,35.0
50%,19744.0,60000.0,2.0,43.0
75%,24470.75,70000.0,3.0,52.0
max,29447.0,170000.0,5.0,89.0


## Transform Numerical into Log

In [14]:
df['Income'] = np.log(df['Income'])

In [15]:
df['Children'] = np.log(df['Children']+1)

In [16]:
df['Age'] = np.log(df['Age'])

## Outliers

In [17]:
# Calculate the first and third quartiles (Q1 and Q3)
Q1 = df['Income'].quantile(0.25)
Q3 = df['Income'].quantile(0.75)

# Calculate the Interquartile Range (IQR)
IQR = Q3 - Q1

# Define the bounds for the outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Replace values below the lower bound with the lower bound itself
df['Income'] = df['Income'].apply(lambda x: lower_bound if x < lower_bound else x)

# Replace values above the upper bound with the upper bound itself
df['Income'] = df['Income'].apply(lambda x: upper_bound if x > upper_bound else x)

In [18]:
df.describe()

Unnamed: 0,ID,Income,Children,Age
count,1000.0,1000.0,992.0,1000.0
mean,19965.992,2.3738,0.88614,3.755938
std,5347.333948,0.062101,0.632541,0.254891
min,11000.0,2.220327,0.0,3.218876
25%,15290.75,2.333013,0.0,3.555348
50%,19744.0,2.398086,1.098612,3.7612
75%,24470.75,2.412,1.386294,3.951244
max,29447.0,2.48853,1.791759,4.488636


## Create a House Owner /Bike Owner Flag

## Encode Categorical into Ordinal Encoding

# Export Cleaned Data