# Data Cleaning

In [1]:
# importing libraries
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [2]:
# Load the stock data from a CSV file (path is relative to the working directory) into `data`
data = pd.read_csv("final_stocks_data.csv")
# Display the first rows (up to 20) as a quick sanity check of columns and values
data.head(20)

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Adj Close,Volume
0,2022-01-03 00:00:00+00:00,AAPL,177.830002,182.880005,177.710007,182.009995,179.273605,104487900.0
1,2022-01-03 00:00:00+00:00,XOM,61.240002,63.599998,61.209999,63.540001,57.618004,24282400.0
2,2022-01-03 00:00:00+00:00,VZ,52.07,52.560001,51.98,52.439999,43.423004,18240100.0
3,2022-01-03 00:00:00+00:00,V,217.520004,222.059998,217.009995,221.429993,216.793564,7694500.0
4,2022-01-03 00:00:00+00:00,TSLA,382.583344,400.356659,378.679993,399.926666,399.926666,103931400.0
5,2022-01-03 00:00:00+00:00,T,18.716011,19.320242,18.716011,19.206949,15.77299,76970210.0
6,2022-01-03 00:00:00+00:00,PG,161.690002,162.940002,159.880005,162.899994,151.253021,9317300.0
7,2022-01-03 00:00:00+00:00,PFE,58.5,58.549999,56.34,56.650002,50.231384,57219200.0
8,2022-01-03 00:00:00+00:00,PEP,172.199997,173.190002,170.570007,172.979996,160.040619,5488900.0
9,2022-01-03 00:00:00+00:00,NKE,167.529999,167.910004,163.309998,164.669998,158.93515,5670500.0


In [3]:
# Count missing (NaN/None) values in each column
data.isna().sum()

Date         0
Symbol       0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [4]:
# NOTE: `isnull` is an alias of `isna`, so this repeats the per-column missing-value count from the previous cell
data.isnull().sum()

Date         0
Symbol       0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,15030.0,15030.0,15030.0,15030.0,15030.0,15030.0
mean,151.041353,152.886914,149.192466,151.074372,147.67789,38271130.0
std,116.481232,118.02374,114.93429,116.514823,116.721466,95672320.0
min,10.971,11.735,10.813,11.227,11.216744,942700.0
25%,60.317501,60.869999,59.925,60.415,57.215084,5513700.0
50%,124.389999,125.935001,122.814999,124.445,118.864254,12124250.0
75%,186.334999,188.622505,184.284996,186.5625,184.053253,29056500.0
max,630.409973,633.890015,623.0,633.659973,633.659973,1543911000.0


In [6]:
# List the column labels exactly as loaded (original capitalisation and spaces)
data.columns

Index(['Date', 'Symbol', 'Open', 'High', 'Low', 'Close', 'Adj Close',
       'Volume'],
      dtype='object')

In [7]:
# Removing outliers with a Z-score filter
def remove_outliers_zscore(data, numerical_cols, threshold=3):
    """
    Removes outlier rows from the given DataFrame using the Z-score method.

    A Z-score is computed for every value in ``numerical_cols`` relative to
    the mean and population standard deviation (ddof=0) of its column, taken
    over all rows of ``data`` together. A row is dropped when the absolute
    Z-score of at least one of those columns is greater than or equal to
    ``threshold``.

    Values whose Z-score is undefined are never treated as outliers:
    missing values are ignored when computing the column statistics and the
    rows containing them are kept, and a column with zero variance (all
    values identical) does not cause any row to be dropped.

    Parameters:
    -----------
    data : pd.DataFrame
        The input DataFrame containing the data. It is not modified.
    numerical_cols : list
        List of numerical columns to check for outliers.
    threshold : float, optional (default=3)
        The Z-score threshold to use for outlier detection.

    Returns:
    --------
    pd.DataFrame
        A new DataFrame with outliers removed. The original index labels of
        the surviving rows are preserved.

    Raises:
    -------
    KeyError
        If a name in ``numerical_cols`` is not a column of ``data``.
    """
    numeric = data[numerical_cols]

    # Nothing to score (no rows or no columns selected): keep every row.
    if numeric.empty:
        return data.copy()

    # Work on a plain float array so that missing values are uniformly NaN.
    values = numeric.to_numpy(dtype=float, na_value=np.nan)

    # nan_policy="omit" keeps a single NaN from turning the whole column's
    # Z-scores into NaN; a constant column still yields NaN (0 / 0).
    with np.errstate(divide="ignore", invalid="ignore"):
        z_scores = zscore(values, axis=0, nan_policy="omit")
        # Comparisons against NaN are False, so undefined Z-scores are not
        # flagged as outliers.
        is_outlier = np.abs(z_scores) >= threshold

    # Keep the rows in which no screened column is an outlier. The boolean
    # array selects by position, so any index on `data` is handled.
    keep = ~is_outlier.any(axis=1)
    return data.loc[keep].copy()

# Columns screened for outliers; the names match the CSV headers as loaded (before any renaming)
numerical_cols = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
# Filter into a separate frame; `data` itself is not reassigned here.
# NOTE(review): the Z-scores are computed over all rows of `data` together, i.e. pooled across
# every symbol, so rows are judged against the whole dataset rather than their own ticker - confirm intended.
# NOTE(review): the later cells visible in this notebook keep using `data`, not `data_cleaned` - confirm intended.
data_cleaned = remove_outliers_zscore(data, numerical_cols)

# Compare (rows, columns) before and after filtering
print(f"Original data shape: {data.shape}")
print(f"Cleaned data shape: {data_cleaned.shape}")

# Optional: Save the cleaned data to a CSV (no save is performed in this cell)




Original data shape: (15030, 8)
Cleaned data shape: (14293, 8)


In [8]:
# Inspect column dtypes and non-null counts before any type conversion
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15030 entries, 0 to 15029
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       15030 non-null  object 
 1   Symbol     15030 non-null  object 
 2   Open       15030 non-null  float64
 3   High       15030 non-null  float64
 4   Low        15030 non-null  float64
 5   Close      15030 non-null  float64
 6   Adj Close  15030 non-null  float64
 7   Volume     15030 non-null  float64
dtypes: float64(6), object(2)
memory usage: 939.5+ KB


In [9]:
# Convert the "Date" column to pandas datetimes; the column on `data` is overwritten in place.
# NOTE: a later cell lowercases the column names, so re-running this cell after that rename raises KeyError.
data["Date"] = pd.to_datetime(data["Date"])

In [13]:
# Normalise column names: trim surrounding whitespace, lowercase, and replace
# spaces with underscores (e.g. 'Adj Close' -> 'adj_close')
data.columns = data.columns.str.strip().str.lower().str.replace(' ', '_')


In [14]:
# Re-check dtypes and column names after the date conversion and column renaming
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15030 entries, 0 to 15029
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype              
---  ------     --------------  -----              
 0   date       15030 non-null  datetime64[ns, UTC]
 1   symbol     15030 non-null  object             
 2   open       15030 non-null  float64            
 3   high       15030 non-null  float64            
 4   low        15030 non-null  float64            
 5   close      15030 non-null  float64            
 6   adj_close  15030 non-null  float64            
 7   volume     15030 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(6), object(1)
memory usage: 939.5+ KB


In [12]:
# Count fully duplicated rows (rows identical to an earlier row in every column).
duplicates = data.duplicated().sum()
# End the cell with the bare name so the count is actually displayed;
# an assignment on its own produces no cell output.
duplicates

0

In [None]:
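# Save to a new CSV file
# NOTE(review): this would write `data`, not the outlier-filtered `data_cleaned`, under a
# "cleaned" filename; confirm which frame should be saved before re-enabling.
# data.to_csv('cleaned_yahoo_finance_data.csv', index=True)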


In [16]:
# Set Date as the index
# data = data.set_index('date')

# # Check for missing dates (business days)
# all_dates = pd.date_range(start=data.index.min(), end=data.index.max(), freq='B')
# missing_dates = all_dates.difference(data.index)

# print(f"Missing dates: {len(missing_dates)}")


Missing dates: 19


In [None]:
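# # Reindex the DataFrame to include the missing dates
# # NOTE(review): the data holds several symbols per date, so a 'date' index has duplicate
# # labels and DataFrame.reindex raises on duplicate labels; reindex per symbol instead - verify before re-enabling.
# data = data.reindex(all_dates)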
