# Task 1: Data Cleaning and Preprocessing
<br>
Description: Work with a raw dataset (e.g., CSV file) that contains missing values, duplicates, and inconsistent data formats.


In [9]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [10]:
df = pd.read_csv("Stock.csv")
df

Unnamed: 0,symbol,date,open,high,low,close,volume
0,AAL,2014-01-02,25.0700,25.8200,25.0600,25.3600,8998943
1,AAPL,2014-01-02,79.3828,79.5756,78.8601,79.0185,58791957
2,AAP,2014-01-02,110.3600,111.8800,109.2900,109.7400,542711
3,ABBV,2014-01-02,52.1200,52.3300,51.5200,51.9800,4569061
4,ABC,2014-01-02,70.1100,70.2300,69.4800,69.8900,1148391
...,...,...,...,...,...,...,...
497467,XYL,2017-12-29,68.5300,68.8000,67.9200,68.2000,1046677
497468,YUM,2017-12-29,82.6400,82.7100,81.5900,81.6100,1347613
497469,ZBH,2017-12-29,121.7500,121.9500,120.6200,120.6700,1023624
497470,ZION,2017-12-29,51.2800,51.5500,50.8100,50.8300,1261916


# **Working with inconsistent type of data**

Identifying Inconsistent data types

In [11]:
df.dtypes

symbol     object
date       object
open      float64
high      float64
low       float64
close     float64
volume      int64
dtype: object

# **Identifying Object type data**

In [12]:
for i in df.columns:
    if(df[i].dtype == "object"):
        print(i)

symbol
date


Label Encoding Object Type Data 
Coverting Categorical ---> Numerical data

In [13]:
for i in df.columns:
    if(df[i].dtype == 'object' ):
        df[i] = LabelEncoder().fit_transform(df[i])

In [14]:
df.dtypes

symbol      int32
date        int32
open      float64
high      float64
low       float64
close     float64
volume      int64
dtype: object

Adjusting Data types to be consistent 
Converting all the data into type-->Float32

In [15]:
df = df.astype('float32')

# **Handling Missing values**

Identifying Missing values

In [16]:
# Identifying Missing values
nullpoints = pd.DataFrame(df.isnull().sum() )
nullpoints

Unnamed: 0,0
symbol,0
date,0
open,11
high,8
low,8
close,0
volume,0


Replacing the missing values with mode of the data 

In [17]:
for i in df.columns:
    if(df[i].isnull().sum() != 0):
        df[i] = df[i].mode()[0]

In [18]:
df.isnull().sum()

symbol    0
date      0
open      0
high      0
low       0
close     0
volume    0
dtype: int64

# **Standizing Data**

Arrananging all the data to fall under same range
<br>
Making each column to be between 0 to 1

In [19]:
for i in df.columns:
    df[i] = StandardScaler().fit_transform(df[[i]])

In [None]:
df.head()

Unnamed: 0,symbol,date,open,high,low,close,volume
0,-1.727913,-1.743806,0.0,0.0,0.0,-0.601239,0.57644
1,-1.714198,-1.743806,0.0,0.0,0.0,-0.072439,6.625058
2,-1.721055,-1.743806,0.0,0.0,0.0,0.230318,-0.450782
3,-1.707341,-1.743806,0.0,0.0,0.0,-0.338901,0.038319
4,-1.700483,-1.743806,0.0,0.0,0.0,-0.1624,-0.377207


: 