In [9]:
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Load the csv dataset to the notebook
file_path = "laptop_pricing_dataset_mod1.csv"
df = pd.read_csv(file_path, header = None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,,Manufacturer,Category,Screen,GPU,OS,CPU_core,Screen_Size_cm,CPU_frequency,RAM_GB,Storage_GB_SSD,Weight_kg,Price
1,0.0,Acer,4,IPS Panel,2,1,5,35.56,1.6,8,256,1.6,978
2,1.0,Dell,3,Full HD,1,1,3,39.624,2.0,4,256,2.2,634
3,2.0,Dell,3,Full HD,1,1,7,39.624,2.7,8,256,2.2,946
4,3.0,Dell,4,IPS Panel,2,1,5,33.782,1.6,8,128,1.22,1244


In [11]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239 entries, 0 to 238
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       238 non-null    float64
 1   1       239 non-null    object 
 2   2       239 non-null    object 
 3   3       239 non-null    object 
 4   4       239 non-null    object 
 5   5       239 non-null    object 
 6   6       239 non-null    object 
 7   7       235 non-null    object 
 8   8       239 non-null    object 
 9   9       239 non-null    object 
 10  10      239 non-null    object 
 11  11      234 non-null    object 
 12  12      239 non-null    object 
dtypes: float64(1), object(12)
memory usage: 24.4+ KB
None


In [13]:
df[['Screen_Size_cm']] = np.round(df[['Screen_Size_cm']],2)
df.head(20)

KeyError: "None of [Index(['Screen_Size_cm'], dtype='object')] are in the [columns]"

In [None]:
missing_data = df.isnull()
missing_data.head()
for column in missing_data.columns.values.tolist():
    print(column)
    print(missing_data[column].value_counts())
    print(" ")

In [None]:
mean_weight = df["Weight_kg"].astype('float').mean(axis=0)
df["Weight_kg"].replace(np.nan, mean_weight, inplace = True)

In [None]:
frequent_screen_size_cm = df["Screen_Size_cm"].value_counts().idxmax()
# print(frequent_screen_size_cm)
df["Screen_Size_cm"].replace(np.nan, frequent_screen_size_cm, inplace = True)

In [None]:
# Double square brackets are used because they return a series and since we're updating two columns(2D array) it is necessary
df[["Weight_kg", "Screen_Size_cm"]] = df[["Weight_kg", "Screen_Size_cm"]].astype('float')

In [None]:
# Data standardization: convert weight from kg to pounds
df["Weight_kg"] = 2.205*df["Weight_kg"]
df.rename(columns={"Weight_kg" : "Weight_pounds"}, inplace = True)

# Data standardization: convert screen size from cm to inch
df["Screen_Size_cm"] = df["Screen_Size_cm"]/2.54
df.rename(columns={"Screen_Size_cm" : "Screen_Size_inch"}, inplace = True)

In [None]:
# Normalization
df["CPU_frequency"] = df["CPU_frequency"]/df["CPU_frequency"].max()

In [None]:
# Binning
bins = np.linspace(min(df["Price"]), max(df["Price"]), 4)
group_names =  ["Low", "Medium", "High"]
df["Price_binned"] = pd.cut(df["Price"], bins, labels=group_names, include_lowest = True)
df["Price_binned"].value_counts()

In [None]:

bar = plt.bar(group_names, df['Price_binned'].value_counts())
plt.xlabel('price')
plt.ylabel("frequency")
plt.title("Price bar graph")

In [None]:
Screen_dummy = pd.get_dummies(df["Screen"])
Screen_dummy.rename(columns={'IPS Panel' : 'Screen-IPS_panel', 'Full HD' : 'Screen-Full_HD'}, inplace = True)
# Screen_dummy.head()
df = pd.concat([df, Screen_dummy], axis = 1)
df.drop("Screen", axis = 1, inplace = True)
df.head()

In [None]:
df.to_csv()

In [None]:
print(df.head())