In [691]:
# Importing whatever is necessary at the beginning
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

### Dataset 1: Airline Travelling By Age

### To find the pattern between age and distance travelled

In [692]:
# Load the airline satisfaction data from a CSV expected in the working
# directory, print a section banner, and preview the first rows.
data = pd.read_csv('Airsat21.csv')
print("\t\t\t\tBASIC DATAFRAME EXPLORATION : AIRLINE SATISFACTION")
data.head()

In [693]:
data.tail()

In [694]:
data.dtypes


In [695]:
print(data.columns)

In [696]:
print("\t\t\t\t\tStatistical Summary of Numeric Columns")
data.describe()

In [697]:
data.Gender.value_counts()

In [698]:
data.Customer_Type.value_counts()

In [699]:
data.Type_of_Travel.value_counts()

In [700]:
data.Class.value_counts()

In [701]:
data.satisfaction   .value_counts()

In [702]:
print("\t\t\t\t\tData Visualisation")

In [703]:
# Univariate histogram: distribution of passenger age (20 bins with a KDE
# overlay), i.e. which age groups travel the most.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data['Age'], bins=20, kde=True, ax=ax)
ax.set_title('Histogram of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

In [704]:
# Univariate histogram: distribution of flight distance (20 bins, KDE overlay).
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data['Flight_Distance'], bins=20, kde=True, ax=ax)
ax.set_title('Histogram of Flight Distance')
ax.set_xlabel('Flight_Distance')
ax.set_ylabel('Frequency')
plt.show()

In [705]:
# Univariate pie chart: share of rows in each travel class.
class_counts = data['Class'].value_counts()
fig, ax = plt.subplots(figsize=(8, 5))
class_counts.plot.pie(ax=ax, autopct='%1.1f%%', colors=sns.color_palette('pastel'))
ax.set_title('Pie Chart Of Class Of Travel')
ax.set_ylabel('')  # hide the default y-label pandas puts on pie charts
plt.show()

In [706]:
# Univariate box plot of Age: median, quartiles and any points beyond the whiskers.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x=data['Age'], ax=ax)
ax.set_title('Box plot Of Age')
ax.set_xlabel('Age')
plt.show()

In [707]:
# Univariate box plot of Flight_Distance, used to eyeball outliers before filtering.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x=data['Flight_Distance'], ax=ax)
ax.set_title('Box plot Of Flight Distance')
ax.set_xlabel('Flight_Distance')
plt.show()

In [708]:
# Bivariate Plot Line
sns.lineplot(x='Age',y='Flight_Distance',data=data, errorbar=None)

In [709]:
plt.scatter(x='Age',y='Flight_Distance', data=data)

In [710]:
sns.scatterplot(x='Age',y='Flight_Distance', data=data)

In [711]:
# Bivariate scatter plot: Age against Flight_Distance, one colour per travel class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='Age', y='Flight_Distance', hue='Class', ax=ax)
ax.set_title('Scatter Plot of Age vs Flight Distance (Colored by Class)')
ax.set_xlabel('Age')
ax.set_ylabel('Flight_Distance')
ax.legend(title='Class')
plt.show()

In [712]:
print("\t\t\t\t\t\tData Preprocessing")

In [713]:
# Drop the columns that are not used in the Age vs Flight_Distance study.
data = data.drop(
    columns=[
        'id',
        'Customer_Type',
        'Inflight_wifi_service',
        'Ease_of_Online_booking',
        'Departure/Arrival_time_convenient',
        'Gate_location',
        'Food_and_drink',
        'Online_boarding',
        'Seat_comfort',
        'Inflight_entertainment',
        'On-board_service',
        'Leg_room_service',
        'Baggage_handling',
        'Checkin_service',
        'Inflight_service',
        'Cleanliness',
        'Departure_Delay_in_Minutes',
        'Arrival_Delay_in_Minutes',
    ]
)

In [714]:
data.shape

In [715]:
data = data.dropna()
data.shape

In [716]:
data_num = data.select_dtypes(include='number')
data_num.head() # We are moving numeric values to another variable

In [717]:
# Outlier Filter
# Compute per-column IQR fences for the numeric columns: values below
# Q1 - 1.5*IQR or above Q3 + 1.5*IQR are treated as outliers by the
# row filter a few cells below.
Q1= data_num.quantile(0.25)
Q3= data_num.quantile(0.75)
IQR=Q3-Q1
lower_bound=Q1-1.5*IQR
upper_bound=Q3+1.5*IQR

In [718]:
lower_bound

In [719]:
upper_bound

In [720]:
# Keep only the rows where no numeric column falls outside its fence:
# the element-wise mask is reduced with .any(axis=1) to one boolean per row,
# and ~ inverts it so flagged rows are dropped.
data = data[~((data_num < lower_bound) | (data_num > upper_bound)).any(axis=1)]

In [721]:
data.shape

In [722]:
sns.boxplot(x=data['Flight_Distance'])

In [723]:
data_num = data.select_dtypes(include='number')
data_num.head()

In [724]:
Q1= data_num.quantile(0.25)
Q3= data_num.quantile(0.75)
IQR=Q3-Q1
lower_bound=Q1-1.5*IQR
upper_bound=Q3+1.5*IQR

In [725]:
data = data[~((data_num < lower_bound) | (data_num > upper_bound)).any(axis=1)]

In [726]:
data.shape

In [727]:
sns.boxplot(x=data['Flight_Distance'])

In [728]:
data_num = data.select_dtypes('number')
data_num.shape

In [729]:
# Separate the target (Age) from the predictor columns and preview the predictors.
X = data.drop('Age', axis=1)
Y = data.Age
X.head()

In [730]:
X_num = X.select_dtypes('number')
X_num.head()

In [731]:
X_cat = X.select_dtypes('object')
X_cat.head()

In [732]:
# Rescaling Numeric Columns
# MinMaxScaler rescales each numeric predictor to its default [0, 1] range;
# fit_transform returns a NumPy array, which the next cell wraps back into a
# DataFrame.
# NOTE(review): the scaler is fitted on every row here, before the train/test
# split later in this section, so test rows influence the min/max used for
# scaling - consider fitting on the training split only.
scaler = MinMaxScaler()
X_num_scaled = scaler.fit_transform(X_num)
type(X_num_scaled)

In [733]:
X_num_scaled = pd.DataFrame(X_num_scaled, columns=X_num.columns, index=X_num.index)  # Converting the array to dataframe

In [734]:
X_num_scaled.describe()

In [735]:
X_cat_encoded = pd.get_dummies(X_cat, drop_first=False, dtype=int)  # One Hot Code Category Columns

In [736]:
# Join the scaled numeric columns and the one-hot encoded categorical columns
# side by side (axis=1); both frames carry the original row index.
X = pd.concat([X_num_scaled, X_cat_encoded], axis=1)

In [737]:
 X.shape, X_num_scaled.shape, X_cat_encoded.shape

In [738]:
print("\t\t\t\tChecking for NAs and Shape Compatibility")
print(X.isnull().sum())
print(Y.isnull().sum())
print(X.shape)
print(Y.shape)

In [739]:
X.describe()

In [740]:
# 80/20 hold-out split with a fixed seed so the partition is reproducible,
# followed by a shape report for each of the four pieces.
print("\t\t\t\t\tTrain-Test Split")
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
for label, part in (
    ("X train:", X_train),
    ("X test:", X_test),
    ("Y train:", Y_train),
    ("Y test:", Y_test),
):
    print(label, part.shape)

### Dataset 2: Bangalore Housing (Prices)

### To find the pattern in pricing trends according to the number of bedrooms

In [741]:
bhk = pd.read_csv('BangaloreHousing.csv')
print("\t\t\t\tBASIC DATAFRAME EXPLORATION : BANGALORE HOUSING - PRICES")
bhk.head()

In [742]:
bhk.tail()

In [743]:
bhk.shape

In [744]:
bhk.dtypes

In [745]:
print(bhk.columns)

In [746]:
print("\t\t\t\t\tStatistical Summary of Numeric Columns")
bhk.describe()

In [747]:
bhk.Price.value_counts()

In [748]:
bhk.Area.value_counts()

In [749]:
bhk.No_of_Bedrooms.value_counts()

In [750]:
bhk.MaintenanceStaff .value_counts()

In [751]:
bhk.Gymnasium .value_counts()

In [752]:
bhk.RainWaterHarvesting.value_counts()

In [753]:
bhk.IndoorGames.value_counts()

In [754]:
bhk.ATM.value_counts()

In [755]:
bhk.ClubHouse.value_counts()

In [756]:
bhk.School.value_counts()

In [757]:
bhk.Security.value_counts()

In [758]:
bhk.PowerBackup.value_counts()

In [759]:
print("\t\t\t\t\tData Visualisation")

In [760]:
# Univariate histogram: distribution of house prices (20 bins, KDE overlay).
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(bhk['Price'], bins=20, kde=True, ax=ax)
ax.set_title('Histogram of Price')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

In [761]:
plt.figure(figsize=(8,5))
sns.histplot(bhk['No_of_Bedrooms'], bins=5, kde=True)
plt.title('Histogram of Rooms')
plt.xlabel('No_of_Bedrooms')
plt.ylabel('Frequency')
plt.show()

In [762]:
plt.figure(figsize=(8,5))
sns.histplot(bhk['Security'], bins=10, kde=True)
plt.title('Histogram of Security')
plt.xlabel('Security')
plt.ylabel('Frequency')
plt.show()

In [763]:
# Univariate Pie Chart: share of listings per bedroom count.
# The palette must be a valid seaborn palette name; an empty string makes
# sns.color_palette raise ValueError, so use 'pastel' like the other pie
# charts in this notebook.
plt.figure(figsize=(8,10))
bhk['No_of_Bedrooms'].value_counts().plot(kind='pie',autopct='%1.1f%%',colors=sns.color_palette('pastel'))
plt.title('Pie Chart Of No. of Bedrooms ')
plt.ylabel('')  # hide the default y-label pandas puts on pie charts
plt.show()

In [None]:
# Univariate box plot of Price, used to eyeball outliers before filtering.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x=bhk['Price'], ax=ax)
ax.set_title('Box plot Of Price')
ax.set_xlabel('Price')
plt.show()

In [None]:
plt.figure(figsize=(8,5))
sns.boxplot(x=bhk['No_of_Bedrooms'])
plt.title('Box plot Of No_of_Bedroom')
plt.xlabel('No_of_Bedrooms')
plt.show()

In [None]:
# Bivariate Plot Line
sns.lineplot(x='Price',y='No_of_Bedrooms',data=bhk, errorbar=None)

In [None]:
plt.scatter(x='Price',y='No_of_Bedrooms', data=bhk)

In [None]:
sns.scatterplot(x='Price',y='No_of_Bedrooms', data=bhk)

In [None]:
# Bivariate scatter plot: Price against No_of_Bedrooms, coloured by the Security column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=bhk, x='Price', y='No_of_Bedrooms', hue='Security', ax=ax)
ax.set_title('Scatter Plot of Price vs No. of Bedrooms (Colored by Security)')
ax.set_xlabel('Price')
ax.set_ylabel('No_of_Bedrooms')
ax.legend(title='Security')
plt.show()

In [None]:
print("\t\t\t\t\t\tData Preprocessing")

In [None]:
# Drop the amenity columns that are not used in the price study, then
# report the remaining shape.
bhk = bhk.drop(
    columns=[
        'Gymnasium',
        'SwimmingPool',
        'RainWaterHarvesting',
        'IndoorGames',
        'ATM',
        'ClubHouse',
    ]
)
bhk.shape

In [None]:
bhk = bhk.dropna()
bhk.shape

In [None]:
bhk_num = bhk.select_dtypes(include='number')
bhk_num.head() # If outliers are not eliminated in first try, repeat from here.

In [None]:
# Outlier fences: per numeric column, anything below Q1 - 1.5*IQR or above
# Q3 + 1.5*IQR is treated as an outlier by the row filter below.
Q1 = bhk_num.quantile(0.25)  # Outlier
Q3 = bhk_num.quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

In [None]:
lower

In [None]:
upper

In [None]:
# Drop every row in which at least one numeric column lies outside its IQR fence.
# .any(axis=1) has to be applied to the combined (below | above) mask so the
# indexer is one boolean per row. Previously the parenthesis closed after
# `upper` only, so the indexer was not a per-row boolean Series and rows were
# not filtered as intended.
bhk = bhk[~((bhk_num < lower) | (bhk_num > upper)).any(axis=1)]

In [None]:
bhk.shape

In [None]:
sns.boxplot(x=bhk['Price'])

In [None]:
bhk_num = bhk.select_dtypes(include='number')
bhk_num.head()


In [None]:
Q1 = bhk_num.quantile(0.25)  # Outlier
Q3 = bhk_num.quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

In [None]:
# Second outlier pass: drop rows with any numeric value outside the freshly
# recomputed fences. .any(axis=1) must wrap the combined (below | above) mask
# to yield one boolean per row; the earlier form applied it to the upper-bound
# comparison only, which does not produce a row filter.
bhk = bhk[~((bhk_num < lower) | (bhk_num > upper)).any(axis=1)]

In [None]:
bhk.shape

In [None]:
bhk_num = bhk.select_dtypes(include='number')
bhk_num.head()

In [None]:
Q1 = bhk_num.quantile(0.25)  # Outlier
Q3 = bhk_num.quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

In [None]:
upper

In [None]:
# Third outlier pass: drop rows with any numeric value outside the current
# fences. The row-wise .any(axis=1) is applied to the whole (below | above)
# mask - not just the upper-bound comparison - so the indexer is a per-row
# boolean Series, matching the later passes in this section.
bhk = bhk[~((bhk_num < lower) | (bhk_num > upper)).any(axis=1)]

In [None]:
sns.boxplot(x=bhk['Price'])

In [None]:
bhk_num = bhk.select_dtypes(include='number')
bhk_num.head()

In [None]:
Q1 = bhk_num.quantile(0.25)  # Outlier
Q3 = bhk_num.quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

In [None]:
bhk = bhk[~((bhk_num < lower) | (bhk_num > upper)).any(axis=1)]

In [None]:
sns.boxplot(x=bhk['Price'])

In [None]:
bhk_num = bhk.select_dtypes(include='number')
bhk_num.head()

In [None]:
Q1 = bhk_num.quantile(0.25)  # Outlier
Q3 = bhk_num.quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

In [None]:
bhk = bhk[~((bhk_num < lower) | (bhk_num > upper)).any(axis=1)]

In [None]:
sns.boxplot(x=bhk['Price'])

In [None]:
bhk_num = bhk.select_dtypes(include='number')
bhk_num.head()

In [None]:
Q1 = bhk_num.quantile(0.25)  # Outlier
Q3 = bhk_num.quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

In [None]:
bhk = bhk[~((bhk_num < lower) | (bhk_num > upper)).any(axis=1)]

In [None]:
sns.boxplot(x=bhk['Price'])

In [None]:
bhk_num = bhk.select_dtypes('number')
bhk_num.shape

In [None]:
bhk_num = bhk.select_dtypes(include='number')
bhk_num.head()

In [None]:
Q1 = bhk_num.quantile(0.25)  # Outlier
Q3 = bhk_num.quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

In [None]:
bhk = bhk[~((bhk_num < lower) | (bhk_num > upper)).any(axis=1)]

In [None]:
sns.boxplot(x=bhk['Price'])

In [None]:
bhk_num = bhk.select_dtypes(include='number')
bhk_num.head()

In [None]:
Q1 = bhk_num.quantile(0.25)  # Outlier
Q3 = bhk_num.quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

In [None]:
bhk = bhk[~((bhk_num < lower) | (bhk_num > upper)).any(axis=1)]

In [None]:
sns.boxplot(x=bhk['Price'])

In [None]:
# Separate the target (Price) from the predictor columns and preview the predictors.
# Note: this rebinds X and Y, replacing the airline features from the previous section.
X = bhk.drop('Price', axis=1)
Y = bhk.Price
X.head()

In [None]:
X_num = X.select_dtypes('number')
X_num.head()

In [None]:
X_cat = X.select_dtypes('object')
X_cat.head()

In [None]:
# Rescaling Numeric Columns
scaler = MinMaxScaler()
X_num_scaled = scaler.fit_transform(X_num)
type(X_num_scaled)

In [None]:
X_num_scaled = pd.DataFrame(X_num_scaled, columns=X_num.columns, index=X_num.index)

In [None]:
X_num_scaled.describe()

In [None]:
X_cat_encoded = pd.get_dummies(X_cat, drop_first=False, dtype=int)  # One Hot Code Category Columns

In [None]:
X = pd.concat([X_num_scaled, X_cat_encoded], axis=1)

In [None]:
 X.shape, X_num_scaled.shape, X_cat_encoded.shape

In [None]:
print("\t\t\t\tChecking for NAs and Shape Compatibility")
print(X.isnull().sum())
print(Y.isnull().sum())
print(X.shape)
print(Y.shape)

In [None]:
X.describe()

In [None]:
# 80/20 hold-out split with a fixed seed so the partition is reproducible,
# followed by a shape report for each of the four pieces.
print("\t\t\t\t\tTrain-Test Split")
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
for label, part in (
    ("X train:", X_train),
    ("X test:", X_test),
    ("Y train:", Y_train),
    ("Y test:", Y_test),
):
    print(label, part.shape)

### Dataset 3 : Cigars

### To find the pattern between cigar prices and sales

In [None]:
cgr = pd.read_csv('Cigar.csv')
print("\t\t\t\tBASIC DATAFRAME EXPLORATION : Cigars")
cgr.head()

In [None]:
cgr.tail()

In [None]:
cgr.dtypes

In [None]:
cgr.shape

In [None]:
print(cgr.columns)

In [None]:
print("\t\t\t\tStep 2: Summary Statistics of Numeric Columns")
cgr.describe()

In [None]:
cgr.Num.value_counts()

In [None]:
cgr.state.value_counts()

In [None]:
cgr.year.value_counts()

In [None]:
cgr.price.value_counts()

In [None]:
cgr.Pop.value_counts()

In [None]:
cgr.pop16.value_counts()

In [None]:
cgr.cpi.value_counts()

In [None]:
cgr.ndi.value_counts()

In [None]:
cgr.sales .value_counts()

In [None]:
cgr.pimin.value_counts()

In [None]:
print("\t\t\t\t\t\tData Visualisation")

In [None]:
# Univariate Histogram
plt.figure(figsize=(8, 5))
sns.histplot(cgr['price'], bins=20, kde=True)
plt.title('Histogram of Price')
plt.xlabel('price')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Univariate Histogram
plt.figure(figsize=(8, 5))
sns.histplot(cgr['sales'], bins=20, kde=True)
plt.title('Histogram of Sales')
plt.xlabel('sales')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Univariate Box Plot
plt.figure(figsize=(8, 5))
sns.boxplot(x=cgr['price'])
plt.title('Box Plot of Price')
plt.xlabel('price')
plt.show()

In [None]:
# Univariate Box Plot
plt.figure(figsize=(8, 5))
sns.boxplot(x=cgr['sales'])
plt.title('Box Plot of Sales')
plt.xlabel('sales')
plt.show()

In [None]:
plt.scatter(x='price', y='sales', data=cgr)

In [None]:
sns.scatterplot(x='price', y='sales', data=cgr)

In [None]:
# Bivariate scatter plot: price against sales, coloured by year.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=cgr, x='price', y='sales', hue='year', ax=ax)
ax.set_title('Scatter Plot of Prices vs Sales (Colored by Year)')
ax.set_xlabel('price')
ax.set_ylabel('sales')
ax.legend(title='year')
plt.show()

In [None]:
print("\t\t\t\t\tData Preprocessing")

In [None]:
# Drop the columns that are not used in the price vs sales study.
cgr = cgr.drop(
    columns=[
        'Num',
        'state',
        'pop16',
        'pimin',
    ]
)

In [None]:
cgr.shape

In [None]:
cgr = cgr.dropna()

In [None]:
cgr.shape

In [None]:
cgr_num = cgr.select_dtypes(include='number')
cgr_num.head()

In [None]:
# Outlier fences: per numeric column, anything below Q1 - 1.5*IQR or above
# Q3 + 1.5*IQR is treated as an outlier by the row filter below.
Q1 = cgr_num.quantile(0.25) # Outliers
Q3 = cgr_num.quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper= Q3 + 1.5 * IQR

In [None]:
lower

In [None]:
upper

In [None]:
# Keep only the rows where no numeric column falls outside its fence:
# the element-wise mask is reduced with .any(axis=1) to one boolean per row,
# and ~ inverts it so flagged rows are dropped.
cgr = cgr[~((cgr_num < lower) | (cgr_num > upper)).any(axis=1)]

In [None]:
cgr.shape

In [None]:
sns.boxplot(x=cgr['price'])

In [None]:
cgr_num = cgr.select_dtypes(include='number')
cgr_num.head()

In [None]:
Q1 = cgr_num.quantile(0.25) # Outliers
Q3 = cgr_num.quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper= Q3 + 1.5 * IQR

In [None]:
cgr = cgr[~((cgr_num < lower) | (cgr_num > upper)).any(axis=1)]

In [None]:
cgr.shape

In [None]:
sns.boxplot(x=cgr['sales'])

In [None]:
# Separate the target (price) from the predictor columns and preview the predictors.
# Note: this rebinds X and Y, replacing the features from the previous section.
X = cgr.drop('price', axis=1)
Y = cgr.price
X.head()

In [None]:
X_num = X.select_dtypes('number')
X_num.head()

In [None]:
X_cat = X.select_dtypes('object')
X_cat.head()

In [None]:
scaler = MinMaxScaler()
X_num_scaled = scaler.fit_transform(X_num)

In [None]:
type(X_num_scaled)

In [None]:
X_num_scaled = pd.DataFrame(X_num_scaled, columns=X_num.columns, index=X_num.index)

In [None]:
X_num_scaled.describe()

In [None]:
# Only the scaled numeric block goes into X for this dataset; X_cat selected
# above is not encoded or merged here.
X = pd.concat([X_num_scaled], axis=1)

In [None]:
 X.shape, X_num_scaled.shape

In [None]:
print("\t\t\t\t Checking for NAs and Shape Compatibility")
print(X.isnull().sum())
print(Y.isnull().sum())
print(X.shape)
print(Y.shape)

In [None]:
X.describe()

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Report the shape of each piece produced by the train/test split above.
print("\t\t\t\t\t\tTest-Train Split")
for label, part in (
    ("X Train:", X_train),
    ("X Test:", X_test),
    ("Y Train:", Y_train),
    ("Y Test:", Y_test),
):
    print(label, part.shape)

### Dataset 4: Music Genre

### To find the pattern between duration of music and energy associated with it

In [None]:
cd = pd.read_csv('music_genre.csv')
cd.head()

In [None]:
cd.tail()

In [None]:
cd.shape

In [None]:
cd.dtypes

In [None]:
print(cd.columns)

In [None]:
print("\t\t\t\t\t Summary Statistics of Numeric Columns")
cd.describe()

In [None]:
cd.instance_id.value_counts()

In [None]:
cd.popularity .value_counts()

In [None]:
cd.acousticness .value_counts()

In [None]:
cd.danceability .value_counts()

In [None]:
cd.duration_ms  .value_counts()

In [None]:
cd.energy .value_counts()

In [None]:
cd.instrumentalness .value_counts()

In [None]:
cd.liveness .value_counts()

In [None]:
cd.loudness .value_counts()

In [None]:
cd.speechiness .value_counts()

In [None]:
cd.valence .value_counts()

In [None]:
print("\t\t\t\t\t\tData Visualisation")

In [None]:
# Univariate Histogram
plt.figure(figsize=(8, 5))
sns.histplot(cd['duration_ms'], bins=20, kde=True)
plt.title('Histogram of Duration in ms')
plt.xlabel('duration_ms')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Univariate Histogram
plt.figure(figsize=(8, 5))
sns.histplot(cd['energy'], bins=20, kde=True)
plt.title('Histogram of Energy')
plt.xlabel('energy')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Univariate Pie Chart
plt.figure(figsize=(8, 5))
cd['music_genre'].value_counts().plot(kind='pie', autopct='%1.1f%%', colors=sns.color_palette('pastel'))
plt.title('Pie Chart of Music Genre')
plt.ylabel('')
plt.show()

In [None]:
# Univariate Box Plot
plt.figure(figsize=(8, 5))
sns.boxplot(x=cd['duration_ms'])
plt.title('Box Plot of Duration in ms')
plt.xlabel('duration_ms')
plt.show()

In [None]:
# Univariate Box Plot
plt.figure(figsize=(8, 5))
sns.boxplot(x=cd['energy'])
plt.title('Box Plot of Energy')
plt.xlabel('energy')
plt.show()

In [None]:
sns.lineplot(x='duration_ms', y='energy', data=cd, errorbar=None)

In [None]:
sns.scatterplot(x='duration_ms', y='energy', data=cd)

In [None]:
# Bivariate Scatter Plot: track duration against energy, one colour per genre.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='duration_ms', y='energy', hue='music_genre', data=cd)
plt.title('Scatter Plot of Duration in ms vs Energy (colored by Music Genre)')
plt.xlabel('duration_ms')
plt.ylabel('energy')  # axis label previously misspelled as 'enegy'
plt.legend(title='music_genre')
plt.show()

In [None]:
print("\t\t\t\t\t\tData Preprocessing")

In [None]:
# Drop the columns that are not used in the duration vs energy study.
cd = cd.drop(
    columns=[
        'instance_id',
        'acousticness',
        'danceability',
        'instrumentalness',
        'key',
        'liveness',
        'loudness',
        'mode',
        'speechiness',
        'tempo',
        'obtained_date',
        'valence',
        'artist_name',
        'track_name',
    ]
)

In [None]:
cd.shape

In [None]:
cd = cd.dropna()
cd.shape

In [None]:
cd_num = cd.select_dtypes(include='number')
cd_num.head()

In [None]:
# Outlier fences: per numeric column, anything below Q1 - 1.5*IQR or above
# Q3 + 1.5*IQR is treated as an outlier by the row filter below.
Q1 = cd_num.quantile(0.25)
Q3 = cd_num.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [None]:
lower_bound

In [None]:
upper_bound

In [None]:
cd = cd[~((cd_num < lower_bound) | (cd_num > upper_bound)).any(axis=1)]

In [None]:
cd.shape

In [None]:
sns.boxplot(x=cd['duration_ms'])

In [None]:
cd_num = cd.select_dtypes(include='number')
cd_num.head()

In [None]:
Q1 = cd_num.quantile(0.25)
Q3 = cd_num.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [None]:
lower_bound

In [None]:
upper_bound

In [None]:
cd = cd[~((cd_num < lower_bound) | (cd_num > upper_bound)).any(axis=1)]

In [None]:
cd.shape

In [None]:
sns.boxplot(x=cd['duration_ms'])

In [None]:
cd_num = cd.select_dtypes(include='number')
cd_num.head()

In [None]:
Q1 = cd_num.quantile(0.25)
Q3 = cd_num.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [None]:
lower_bound

In [None]:
upper_bound

In [None]:
cd = cd[~((cd_num < lower_bound) | (cd_num > upper_bound)).any(axis=1)]

In [None]:
cd.shape

In [None]:
sns.boxplot(x=cd['duration_ms'])

In [None]:
cd_num = cd.select_dtypes(include='number')
cd_num.head()

In [None]:
Q1 = cd_num.quantile(0.25)
Q3 = cd_num.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [None]:
cd = cd[~((cd_num < lower_bound) | (cd_num > upper_bound)).any(axis=1)]

In [None]:
cd.shape

In [None]:
sns.boxplot(x=cd['duration_ms'])

In [None]:
# Separate the target (duration_ms) from the predictor columns and preview the predictors.
# Note: this rebinds X and Y, replacing the features from the previous section.
X = cd.drop('duration_ms', axis=1)
Y = cd.duration_ms
X.head()

In [None]:
X_num = X.select_dtypes('number')
X_num.head()

In [None]:
X_cat = X.select_dtypes('object')
X_cat.head()

In [None]:
scaler = MinMaxScaler()
X_num_scaled = scaler.fit_transform(X_num)

In [None]:
type(X_num_scaled)

In [None]:
X_num_scaled = pd.DataFrame(X_num_scaled, columns=X_num.columns, index=X_num.index)

In [None]:
X_num_scaled.describe()

In [None]:
X_cat_encoded = pd.get_dummies(X_cat, drop_first=False, dtype=int)

In [None]:
X_cat.describe()

In [None]:
X = pd.concat([X_num_scaled, X_cat_encoded], axis=1)

In [None]:
 X.shape, X_num_scaled.shape, X_cat_encoded.shape

In [None]:
print("\t\t\t\t\t Checking for NAs and Shape Compatibility")
print(X.isnull().sum())
print(Y.isnull().sum())
print(X.shape)
print(Y.shape)

In [None]:
X.describe()

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
print("\t\t\t\t\t\tTest-Train Split")
print("X Train:",X_train.shape)
print("X Test:",X_test.shape)
print("Y Train:",Y_train.shape) 
print("Y Test:",Y_test.shape)

### Dataset 5: Foreign Brand Cars

### To find the pattern between MSRP (MANUFACTURER'S SUGGESTED RETAIL PRICE) and Engine Horsepower

In [None]:
cr = pd.read_csv('Cars.csv')
cr.head()

In [None]:
cr.tail()

In [None]:
cr.shape

In [None]:
cr.dtypes

In [None]:
print(cr.columns)

In [None]:
print("\t\t\t\t\t Summary Statistics of Numeric Columns")
cr.describe()

In [None]:
cr.Year.value_counts()

In [None]:
cr.EngineHP.value_counts()

In [None]:
cr.Engine_Cylinders.value_counts()

In [None]:
cr.Number_of_Doors.value_counts()

In [None]:
cr.highway_MPG.value_counts()

In [None]:
cr.city_mpg.value_counts()

In [None]:
cr.Popularity .value_counts()

In [None]:
cr.MSRP .value_counts()

In [None]:
# Univariate Histogram
plt.figure(figsize=(8, 5))
sns.histplot(cr['EngineHP'], bins=20, kde=True)
plt.title('Histogram of Engine HP')
plt.xlabel('EngineHP')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Univariate Histogram
plt.figure(figsize=(8, 5))
sns.histplot(cr['MSRP'], bins=20, kde=True)
plt.title('Histogram of MSRP')
plt.xlabel('MSRP')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Univariate Pie Chart
plt.figure(figsize=(10,8))
cr['Driven_Wheels'].value_counts().plot(kind='pie', autopct='%1.1f%%', colors=sns.color_palette('pastel'))
plt.title('Pie Chart of Driven Wheel Type')
plt.ylabel('')
plt.show()

In [None]:
# Univariate Box Plot
plt.figure(figsize=(8, 5))
sns.boxplot(x=cr['EngineHP'])
plt.title('Box Plot of Engine HP')
plt.xlabel('EngineHP')
plt.show()

In [None]:
# Univariate Box Plot
plt.figure(figsize=(8, 5))
sns.boxplot(x=cr['MSRP'])
plt.title('Box Plot of MSRP')
plt.xlabel('MSRP')
plt.show()

In [None]:
sns.scatterplot(x='EngineHP', y='MSRP', data=cr)

In [None]:
# Bivariate scatter plot: EngineHP against MSRP, one colour per driven-wheel type.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=cr, x='EngineHP', y='MSRP', hue='Driven_Wheels', ax=ax)
ax.set_title('Scatter Plot of Engine HP vs MSRP (colored by Driven Wheel Type)')
ax.set_xlabel('EngineHP')
ax.set_ylabel('MSRP')
ax.legend(title='Driven_Wheels')
plt.show()

In [None]:
print("\t\t\t\t\tData Preprocessing")

In [None]:
# Drop the columns that are not used in the MSRP vs EngineHP study, then
# report the remaining shape.
# If this cell fails (for example because it was already run and the columns
# are gone), reload Cars.csv at the top of this dataset section and run the
# cells downward again.
cr = cr.drop(
    columns=[
        'Model',
        'Year',
        'Engine_Fuel_Type',
        'Engine_Cylinders',
        'Transmission_Type',
        'Number_of_Doors',
        'Market_Category',
        'Vehicle_Size',
        'Popularity',
    ]
)
cr.shape

In [None]:
cr = cr.dropna()
cr.shape

In [None]:
cr_num = cr.select_dtypes(include='number')
cr_num.head()

In [None]:
# Outlier fences: per numeric column, anything below Q1 - 1.5*IQR or above
# Q3 + 1.5*IQR is treated as an outlier by the row filter below.
Q1 = cr_num.quantile(0.25) # Outliers
Q3 = cr_num.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [None]:
lower_bound

In [None]:
upper_bound

In [None]:
cr = cr[~((cr_num < lower_bound) | (cr_num > upper_bound)).any(axis=1)]

In [None]:
cr.shape

In [None]:
sns.boxplot(x=cr['MSRP'])

In [None]:
cr_num = cr.select_dtypes(include='number')
cr_num.head()

In [None]:
Q1 = cr_num.quantile(0.25) # Outliers
Q3 = cr_num.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [None]:
cr = cr[~((cr_num < lower_bound) | (cr_num > upper_bound)).any(axis=1)]

In [None]:
cr.shape

In [None]:
sns.boxplot(x=cr['MSRP'])

In [None]:
cr_num = cr.select_dtypes(include='number')
cr_num.head()

In [None]:
Q1 = cr_num.quantile(0.25) # Outliers
Q3 = cr_num.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [None]:
cr = cr[~((cr_num < lower_bound) | (cr_num > upper_bound)).any(axis=1)]

In [None]:
cr.shape

In [None]:
sns.boxplot(x=cr['MSRP'])

In [None]:
cr_num = cr.select_dtypes(include='number')
cr_num.head()

In [None]:
Q1 = cr_num.quantile(0.25) # Outliers
Q3 = cr_num.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [None]:
cr = cr[~((cr_num < lower_bound) | (cr_num > upper_bound)).any(axis=1)]

In [None]:
cr.shape

In [None]:
sns.boxplot(x=cr['MSRP'])

In [None]:
# Separate the target (MSRP) from the predictor columns and preview the predictors.
# Note: unlike the earlier sections this one names the target `y` (lowercase);
# the NA check and train/test split below in this section use `y` as well, and
# the `Y` left over from the previous dataset is not touched.
X = cr.drop('MSRP', axis=1)
y = cr.MSRP
X.head()

In [None]:
X_num = X.select_dtypes('number')
X_num.head()

In [None]:
X_cat = X.select_dtypes('object')
X_cat.head()

In [None]:
scaler = MinMaxScaler()
X_num_scaled = scaler.fit_transform(X_num)

In [None]:
type(X_num_scaled)

In [None]:
X_num_scaled = pd.DataFrame(X_num_scaled, columns=X_num.columns, index=X_num.index)

In [None]:
X_num_scaled.describe()

In [None]:
X_cat_encoded = pd.get_dummies(X_cat, drop_first=False, dtype=int)

In [None]:
X = pd.concat([X_num_scaled, X_cat_encoded], axis=1)

In [None]:
 X.shape, X_num_scaled.shape, X_cat_encoded.shape

In [None]:
print("\t\t\t\t\t Checking for NAs and Shape Compatibility")
print(X.isnull().sum())
print(y.isnull().sum())
print(X.shape)
print(y.shape)

In [None]:
X.describe()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Report the shape of each piece produced by the train/test split above.
print("\t\t\t\t\t\tTest-Train Split")
for label, part in (
    ("X Train:", X_train),
    ("X Test:", X_test),
    ("y Train:", y_train),
    ("y Test:", y_test),
):
    print(label, part.shape)