In [None]:
# Basic Imports
# For numeric/scientific calculations
import numpy as np
# For Data Reading and preprocessing
import pandas as pd
# For Data Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# To ignore warnings
import warnings
warnings.filterwarnings('ignore')
# For machine learning packages we use scikit-learn; its modules are imported where they are first needed

In [None]:
# Load the three CSV data sets used in this notebook (adult, computers, Cars93)
adult_df, comp_df, cars_df = (
    pd.read_csv(csv_name)
    for csv_name in ("adult.csv", "computers.csv", "Cars93.csv")
)

In [None]:
# Report the (rows, columns) shape of every data set, each followed by a divider line
divider = '*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*'
for label, frame in (('adult', adult_df), ('cars93', cars_df), ('computers', comp_df)):
    print(f'Shape of {label} dataframe:', frame.shape)
    print(divider)

In [None]:
# Preview the first rows of the adult data set to see its columns and sample values
adult_df.head()

In [None]:
# Preview the first rows of the Cars93 data set
cars_df.head()

In [None]:
# Preview the first rows of the computers data set
comp_df.head()

In [None]:
# Gather information about the columns of the adult data set
# (column names, non-null counts and dtypes)
adult_df.info()

In [None]:
# Column summary of the Cars93 data set
cars_df.info() 

In [None]:
# Column summary of the computers data set
comp_df.info()

In [None]:
# Note: pandas reports columns that hold strings with the dtype `object`, not `string`.
# int32 and int64 differ in item size (4 bytes vs 8 bytes); the same holds for other sized dtypes such as float32 and float64.

In [None]:
# The adult data set marks unknown values with '?'; turn every such cell into NaN
# so pandas treats it as a missing value
unknown_cells = adult_df == '?'
adult_df = adult_df.mask(unknown_cells, np.nan)

In [None]:
# Check that the replacement took effect: the non-null counts of the affected
# columns should now be lower than the total number of rows
adult_df.info()

In [None]:
# We can see that some columns now contain null (missing) values

In [None]:
# Count the missing values in every column
adult_df.isna().sum()

In [None]:
# Show the distinct workclass labels (NaN included, if present) rather than the
# raw column, whose printed form is truncated and hides most categories
adult_df['workclass'].unique()

In [None]:
# The values show that workclass is a categorical variable, so its missing entries
# are filled with the most frequent category (the mode).
# The result is assigned back to the column: calling fillna(inplace=True) on
# adult_df['workclass'] acts on an intermediate object and does not update
# adult_df when pandas Copy-on-Write is active.
workclass_mode = adult_df['workclass'].mode()[0]
adult_df['workclass'] = adult_df['workclass'].fillna(workclass_mode)

In [None]:
# Check that the imputation took effect: workclass should no longer report missing values
adult_df.info()

In [None]:
# The same treatment is applied to occupation and native.country.
# Show the distinct occupation labels (NaN included, if present) rather than the
# raw column, whose printed form is truncated and hides most categories
adult_df['occupation'].unique()

In [None]:
# Fill missing occupations with the most frequent category (the mode).
# The result is assigned back to the column: calling fillna(inplace=True) on
# adult_df['occupation'] acts on an intermediate object and does not update
# adult_df when pandas Copy-on-Write is active.
occupation_mode = adult_df['occupation'].mode()[0]
adult_df['occupation'] = adult_df['occupation'].fillna(occupation_mode)

In [None]:
# Show the distinct native.country labels (NaN included, if present) rather than
# the raw column, whose printed form is truncated and hides most categories
adult_df['native.country'].unique()

In [None]:
# Fill missing countries with the most frequent category (the mode).
# The result is assigned back to the column: calling fillna(inplace=True) on
# adult_df['native.country'] acts on an intermediate object and does not update
# adult_df when pandas Copy-on-Write is active.
country_mode = adult_df['native.country'].mode()[0]
adult_df['native.country'] = adult_df['native.country'].fillna(country_mode)

In [None]:
# Re-count the missing values per column after the imputation steps
adult_df.isna().sum()

In [None]:
# Column summary after imputation, to compare the non-null counts with the earlier output
adult_df.info()

In [None]:
# All missing values have now been filled.
# List the column names in order to decide which one is the target variable
adult_df.columns

In [None]:
# 'income' is the quantity to be predicted, so it is the target / dependent variable (y);
# all the other columns are treated as independent variables (X).
# The data set is separated into the two parts below.
target_column = 'income'
X = adult_df.drop(columns=[target_column])
y = adult_df[target_column]

In [None]:
# Split the data set into training data and test data:
# 80% is used for training and 20% is held out for testing, hence test_size = 0.2.
# random_state is fixed so the same split is produced on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [None]:
# Feature engineering:
# The column names tell us which variables are categorical, and we have seen that
# those columns hold string values, so they are encoded as integers here.
# The encoded values are scaled in the next step so all features share a comparable scale.
from sklearn import preprocessing

categorical = ['workclass', 'education', 'marital.status', 'occupation', 'relationship',
               'race', 'sex', 'native.country']

# Work on explicit copies so the encoded columns are written into independent
# frames rather than into slices of X.
X_train = X_train.copy()
X_test = X_test.copy()

for feature in categorical:
    le = preprocessing.LabelEncoder()
    # One mapping per feature, shared by both splits. Calling fit_transform on
    # each split separately would assign different integers to the same category
    # in train and test. The encoder is fitted on the labels of both splits so
    # that a category occurring only in the test split does not make transform() fail.
    le.fit(pd.concat([X_train[feature], X_test[feature]]))
    X_train[feature] = le.transform(X_train[feature])
    X_test[feature] = le.transform(X_test[feature])

In [None]:
# Scaling:
# either MinMaxScaler or StandardScaler can be used; StandardScaler is applied here.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# The scaler is fitted on the training split only and its statistics are reused
# for the test split.
# Column labels and row index are taken from each split itself (not from the
# global X, which other cells may rebind), so the scaled rows stay aligned with
# y_train / y_test.
X_train = pd.DataFrame(scaler.fit_transform(X_train),
                       columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test),
                      columns=X_test.columns, index=X_test.index)

In [None]:
# Preview the training features after encoding and scaling
X_train.head()

In [None]:
# PCA (principal component analysis)
from sklearn.decomposition import PCA

pca = PCA()
# Fit the projection on the training data only, then apply that same projection
# to the test data so both splits are expressed in the same component space.
# NOTE: X_train and X_test are replaced by NumPy arrays from here on.
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# explained_variance_ratio_ => the proportion of the data set's variance
# that lies along the axis of each principal component.
pca.explained_variance_ratio_

In [None]:
# We shall now move on to regression.
# First, let us look at the computers data set again:
comp_df.head()

In [None]:
# Data visualisation: repair time plotted against the number of units
plt.scatter(comp_df['Units'], comp_df['Minutes'])
plt.xlabel("Number of Units")
plt.ylabel("Time to repair in Minutes")
# A descriptive title replaces the empty one so the figure can be read on its own
plt.title("Repair time vs. number of units")
plt.show()

In [None]:
# The scatter plot suggests that the association between the two variables is linear.
# Pearson's r measures the strength of a *linear* relationship (Spearman's rho
# only measures a monotonic one), so Pearson is the coefficient to check here.
# Only the two studied columns are selected, so helper columns added to comp_df
# in other cells do not appear in the matrix.
comp_df[["Units", "Minutes"]].corr(method="pearson")

In [None]:
# There is a high correlation between the two variables, so we can proceed with a linear model.
# Note that a single summary value such as the mean or the median is not a good
# way to summarise this data; building a model is the better approach.
# The plot in the next cell illustrates why: the mean repair time is stored as a
# constant column so that it can be drawn as a "mean model".
comp_mean = comp_df['Minutes'].mean()
comp_df = comp_df.assign(MeanTime=comp_mean)

In [None]:
# Observations as a scatter plot, with the mean model drawn as a red horizontal line
fig, ax = plt.subplots()
ax.scatter(x="Units", y="Minutes", data=comp_df)
ax.add_line(plt.Line2D(comp_df.Units, comp_df.MeanTime, color="red"))
# Label the figure so it can be understood without the surrounding cells
ax.set_xlabel("Number of Units")
ax.set_ylabel("Time to repair in Minutes")
ax.set_title("Mean model vs. observed repair time")
# Render the figure instead of ending the cell on the Line2D repr
plt.show()

In [None]:
# Error of the mean model: the constant prediction (MeanTime) compared with each
# observed repair time. Error = predicted - actual.
mean_model_columns = ["Units", "Actual time", "Predicted time", "Error"]
mean_model_values = np.column_stack((
    comp_df.Units,
    comp_df.Minutes,
    comp_df.MeanTime,
    comp_df.MeanTime - comp_df.Minutes,
))
comp_error_df = pd.DataFrame(mean_model_values, columns=mean_model_columns)
comp_error_df

In [None]:
# Sum of the signed errors of the mean model
comp_error_df['Error'].sum()

In [None]:
# The signed errors add up to a value close to zero, so their plain sum says
# nothing about the quality of the fit. The error is therefore raised to a higher
# power before it is aggregated, as in the mean squared error, RMSE, etc.
# The table below adds the squared error of the mean model for every observation.

mean_model_error = comp_df.MeanTime - comp_df.Minutes
squared_error_columns = ["Units", "Actual time", "Predicted time", "Error", "Sq.Error"]
squared_error_values = np.column_stack((
    comp_df.Units,
    comp_df.Minutes,
    comp_df.MeanTime,
    mean_model_error,
    mean_model_error**2,
))
comp_error_df = pd.DataFrame(squared_error_values, columns=squared_error_columns)
comp_error_df

In [None]:
# Sum of squared errors of the mean model.
# The vectorised Series.sum() is used, as for the signed errors, instead of
# iterating over the Series with the Python builtin sum().
comp_error_df['Sq.Error'].sum()

In [None]:
# Our aim is to make this sum of squared errors as small as possible.
# This is achieved by fitting a linear regression model.
from sklearn.linear_model import LinearRegression

In [None]:
# Fit a linear regression of repair time (Minutes) on the number of units (Units).
# Both the feature and the target are passed as one-column DataFrames.
comp_model = LinearRegression()
comp_model.fit(comp_df[["Units"]], comp_df[["Minutes"]])

In [None]:
# Intercept of the fitted regression line
comp_model.intercept_

In [None]:
# Coefficient (slope) of Units in the fitted regression line
comp_model.coef_

In [None]:
# Predicted repair time for every observation.
# predict() applies the fitted intercept and coefficient itself, so the formula is
# not re-typed by hand and no assumption is made about the shape of coef_ / intercept_.
# ravel() flattens the result (2-D when the model was fitted with a 2-D target) and
# the Series keeps comp_df's index so it lines up with the other columns.
y_pred = pd.Series(
    comp_model.predict(comp_df[["Units"]]).ravel(),
    index=comp_df.index,
    name="Predicted Time",
)
y_pred

In [None]:
# Error table of the regression model.
# "Error" is computed as predicted - actual, the same sign convention as the
# mean-model table (comp_error_df), so the two Error columns can be compared directly.
# The squared error is unaffected by the sign.
regression_error = y_pred - comp_df.Minutes
comp_df_final = pd.DataFrame(np.array([comp_df.Units,
                                       comp_df.Minutes,
                                       y_pred,
                                       regression_error,
                                       regression_error**2]).T,
                             columns=["Units", "Actual Time", "Predicted Time", "Error", "Sq.Error"])
comp_df_final

In [None]:
# Sum of squared errors of the regression model, to be compared with the
# corresponding value of the mean model.
# The vectorised Series.sum() is used instead of the Python builtin sum().
comp_df_final['Sq.Error'].sum()

In [None]:
# Compare both models against the observations:
# red = mean model (constant prediction), black = fitted regression line
fig, ax = plt.subplots()
ax.scatter(x="Units", y="Minutes", data=comp_df, label="Observed")
ax.add_line(plt.Line2D(comp_df.Units, comp_df.MeanTime,
                       color="red", label="Mean model"))
ax.add_line(plt.Line2D(comp_df.Units, comp_df_final['Predicted Time'],
                       color="black", label="Linear regression"))
# Labels and a legend make the figure readable on its own
ax.set_xlabel("Number of Units")
ax.set_ylabel("Time to repair in Minutes")
ax.set_title("Mean model vs. linear regression")
ax.legend()
plt.show()

In [None]:
# Validating our model with an ordinary least squares (OLS) fit from statsmodels
import statsmodels.api as sm

# Dedicated names are used for the regression inputs so that the X / y of the
# adult data set are not overwritten; cells that read X would otherwise break
# when they are re-run.
# add_constant adds the intercept column, which OLS does not include by itself.
comp_X = sm.add_constant(comp_df[["Units"]])
comp_y = comp_df["Minutes"]
model = sm.OLS(comp_y, comp_X).fit()

In [None]:
# Full statsmodels regression report for the fitted OLS model
model.summary()

In [None]:
# R-squared of the scikit-learn model on the data it was fitted on.
# The feature is passed as the same one-column DataFrame that was used in fit(),
# so the feature names match and scikit-learn has no reason to warn about
# missing feature names.
comp_model.score(comp_df[["Units"]], comp_df["Minutes"])

In [None]:
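# You can observe that the model score is the same as the R-squared value in the statsmodels summary.
# Adjusted R-squared is the more appropriate measure for multiple linear regression, because it accounts for the number of predictors.
# R-squared (the score) is the proportion of the variance in the target that the model explains.
# In the given example the score means that the model explains about 98.7% of the variance in repair time;
# it does not mean that each predicted value is within 98.7% of the actual value.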