<a href="https://colab.research.google.com/github/Krzesimir13/DataQuest/blob/main/laptops.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparation of the environment


*   Import the dataset from Google Drive
*   Import pandas and numpy
*   Create a DataFrame from the dataset, plus a working copy used for the cleaning steps

In [1]:
import pandas as pd
import numpy as np


from google.colab import drive
# (Re)mount Google Drive so the CSV below is reachable under /content/drive.
drive.mount('/content/drive', force_remount=True)

# Keep the frame exactly as read in `laptops_raw`; the cleaning steps work on
# the independent copy `laptops`.
laptops_raw = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/laptops.csv",
    encoding='latin-1',
)
laptops = laptops_raw.copy(deep=True)

Mounted at /content/drive


In [2]:
# Print a summary of the working frame: column names, non-null counts, dtypes.
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              1303 non-null   object
 1   Model Name                1303 non-null   object
 2   Category                  1303 non-null   object
 3   Screen Size               1303 non-null   object
 4   Screen                    1303 non-null   object
 5   CPU                       1303 non-null   object
 6   RAM                       1303 non-null   object
 7    Storage                  1303 non-null   object
 8   GPU                       1303 non-null   object
 9   Operating System          1303 non-null   object
 10  Operating System Version  1133 non-null   object
 11  Weight                    1303 non-null   object
 12  Price (Euros)             1303 non-null   object
dtypes: object(13)
memory usage: 132.5+ KB


In [3]:
def clean_col(col):
  """Return a normalized, snake_case version of a column label.

  The label is trimmed, 'Operating System' is shortened to 'os',
  parentheses are removed, spaces become underscores and the result is
  lower-cased.

  Args:
    col: Original column label (str).

  Returns:
    The cleaned label (str).
  """
  name = col.strip().replace('Operating System', 'os')
  # Remove both kinds of parenthesis before turning spaces into underscores.
  for bracket in '()':
    name = name.replace(bracket, '')
  return name.replace(' ', '_').lower()

In [4]:
# Normalize every header with clean_col, then install the new labels.
new_columns = list(map(clean_col, laptops.columns))
laptops.columns = new_columns
laptops.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size', 'screen',
       'cpu', 'ram', 'storage', 'gpu', 'os', 'os_version', 'weight',
       'price_euros'],
      dtype='object')

In [5]:
# Inspect the distinct raw RAM strings before converting them to numbers.
unique_ram = laptops['ram'].unique()
print(unique_ram)

['8GB' '16GB' '4GB' '2GB' '12GB' '6GB' '32GB' '24GB' '64GB']


In [6]:
# Drop the 'GB' unit suffix and store the RAM sizes as integers.
# regex=False makes the literal replacement explicit; the default of `regex`
# differs between pandas versions.
laptops['ram'] = laptops['ram'].str.replace('GB', '', regex=False).astype(int)
laptops['ram'].unique()

array([ 8, 16,  4,  2, 12,  6, 32, 24, 64])

In [7]:
# Put the unit in the column name. Reassigning (instead of inplace=True) keeps
# the cell safe to re-run: a missing 'ram' label is simply ignored by rename.
laptops = laptops.rename(columns={'ram': 'ram_gb'})
# Summary statistics of the RAM sizes; the last expression is displayed.
ram_gb_desc = laptops['ram_gb'].describe()
ram_gb_desc

Unnamed: 0,ram_gb
count,1303.0
mean,8.382195
std,5.084665
min,2.0
25%,4.0
50%,8.0
75%,8.0
max,64.0


In [8]:
# The first whitespace-separated token of each GPU/CPU description is used as
# the manufacturer name.
gpu_split = laptops['gpu'].str.split()
cpu_split = laptops['cpu'].str.split()

# Add the GPU column before the CPU column so the column order is unchanged.
laptops['gpu_manufacturer'] = gpu_split.str[0]
laptops['cpu_manufacturer'] = cpu_split.str[0]

# Frequency of each manufacturer, printed GPU first, then CPU.
gpu_manufacturer_counts = laptops['gpu_manufacturer'].value_counts()
cpu_manufacturer_counts = laptops['cpu_manufacturer'].value_counts()
print(gpu_manufacturer_counts)
print(cpu_manufacturer_counts)


gpu_manufacturer
Intel     722
Nvidia    400
AMD       180
ARM         1
Name: count, dtype: int64
cpu_manufacturer
Intel      1240
AMD          62
Samsung       1
Name: count, dtype: int64


In [9]:
# List the distinct OS labels to spot inconsistent spellings.
laptops['os'].unique()

array(['macOS', 'No OS', 'Windows', 'Mac OS', 'Linux', 'Android',
       'Chrome OS'], dtype=object)

In [10]:
# Variant spelling -> canonical spelling for OS labels.
mapping_dict = {'Mac OS' : 'macOS'}
# Passing the dict to replace() rewrites only the listed labels and leaves
# every other value untouched, so the mapping has a single source of truth.
laptops['os'] = laptops['os'].replace(mapping_dict)
laptops['os'].value_counts()


Unnamed: 0_level_0,count
os,Unnamed: 1_level_1
Windows,1125
No OS,66
Linux,62
Chrome OS,27
macOS,21
Android,2


In [11]:
# Compare two ways of discarding missing data: dropping every row that has a
# null versus dropping every column that has a null.
laptops_no_null_rows = laptops.dropna(axis='index')
laptops_no_null_cols = laptops.dropna(axis='columns')
print(laptops_no_null_rows.shape)
print(laptops_no_null_cols.shape)

(1133, 15)
(1303, 14)


In [12]:
# Count, per OS, the rows whose os_version is missing (before any filling).
value_counts_before = laptops["os"][laptops["os_version"].isnull()].value_counts()
value_counts_before

Unnamed: 0_level_0,count
os,Unnamed: 1_level_1
No OS,66
Linux,62
Chrome OS,27
macOS,13
Android,2


In [13]:
# Fill os_version for two OS groups. Note that each assignment targets every
# row of that OS, not only the rows whose version is missing.
laptops.loc[laptops["os"] == "macOS", "os_version"] = "X"
laptops.loc[laptops["os"] == "No OS", "os_version"] = "Not Applicable"
# Recount the still-missing versions per OS and display them, so the effect of
# the two fills can be compared with `value_counts_before`.
value_counts_after = laptops.loc[laptops["os_version"].isnull(), "os"].value_counts()
value_counts_after

In [14]:
# Show the current column labels.
laptops.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size', 'screen',
       'cpu', 'ram_gb', 'storage', 'gpu', 'os', 'os_version', 'weight',
       'price_euros', 'gpu_manufacturer', 'cpu_manufacturer'],
      dtype='object')

In [15]:
# Put the unit in the column name; the frame is modified in place.
laptops.rename(columns={'weight': 'weight_kg'}, inplace=True)
laptops.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size', 'screen',
       'cpu', 'ram_gb', 'storage', 'gpu', 'os', 'os_version', 'weight_kg',
       'price_euros', 'gpu_manufacturer', 'cpu_manufacturer'],
      dtype='object')

In [17]:
import re


def clean_weight(weight):
  """Extract the leading number of a weight value as a float.

  Args:
    weight: Either a string that starts with a number followed by an
      arbitrary suffix (e.g. '1.37kg', '4kgs'), or an int/float. Leading and
      trailing whitespace in strings is ignored.

  Returns:
    The leading number as a float. Numeric input is returned as a float
    unchanged, which keeps the conversion safe to apply a second time.

  Raises:
    ValueError: If a string does not start with a number.
  """
  if isinstance(weight, (int, float)):
    return float(weight)
  # Optional integer part, optional decimal point, then at least one digit.
  pattern = r'^[0-9]*\.?[0-9]+'
  result = re.match(pattern, weight.strip())
  if result is None:
    # re.match returns None when nothing matches; report the offending value
    # instead of failing with an AttributeError on `None.group()`.
    raise ValueError(f"cannot parse a weight from {weight!r}")
  return float(result.group())

# Convert every entry of the weight column with clean_weight.
laptops['weight_kg'] = laptops['weight_kg'].map(clean_weight)
laptops['weight_kg'].head()

Unnamed: 0,weight_kg
0,1.37
1,1.34
2,1.86
3,1.83
4,1.37


In [19]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
laptops.to_csv('laptops_cleaned.csv', index=False)