# Required python packages

In [1]:
import pandas as pd

# Data import

In [2]:
# Load the raw table from data_v1.csv (path is relative to the working directory).
data = pd.read_csv("data_v1.csv")

# Data sneak peek

In [3]:
# Preview the first rows to see which columns were loaded.
data.head()

Unnamed: 0,S.No.,Name,Mar Cap - Crore,Sales Qtr - Crore,Unnamed: 4
0,1,Reliance Inds.,583436.72,99810.0,
1,2,TCS,563709.84,30904.0,
2,3,HDFC Bank,482953.59,20581.27,
3,4,ITC,320985.27,9772.02,
4,5,H D F C,289497.37,16840.51,


#### There is an irrelevant, empty column "Unnamed: 4". We will drop it.

In [4]:
# "Unnamed: 4" is empty in the previewed rows, so remove that column.
data = data.drop("Unnamed: 4", axis="columns")

# Check for duplicates

#### There are no duplicate entries

In [5]:
# Count the rows flagged as duplicates of another row.
data.duplicated().sum()

0

# Check for NaN

In [6]:
# Count the missing values in each column.
data.isna().sum()

S.No.                  0
Name                   0
Mar Cap - Crore        9
Sales Qtr - Crore    123
dtype: int64

#### There are 9 NaN values in column "Mar Cap - Crore" and 123 NaN values in "Sales Qtr - Crore". Let's find out what percentage of the entries these NaN values represent.

In [7]:
# Print the total number of entries plus the non-null count and dtype of each column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 488 entries, 0 to 487
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   S.No.              488 non-null    int64  
 1   Name               488 non-null    object 
 2   Mar Cap - Crore    479 non-null    float64
 3   Sales Qtr - Crore  365 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 15.4+ KB


#### There are 488 entries in total. Of these, 479 are non-null in "Mar Cap - Crore" and 365 are non-null in "Sales Qtr - Crore", so roughly 1.8% and 25% of the values, respectively, are NaN. The share of NaN in "Sales Qtr - Crore" is too large to simply discard, so we only drop the rows with a missing "Mar Cap - Crore".

In [8]:
# Drop the rows whose market cap is missing. The column is referenced by
# name rather than by position (previously data.columns[2]) so the filter
# is self-explanatory and cannot silently target another column if the
# column order changes. Rows with a missing "Sales Qtr - Crore" are kept.
data = data.dropna(subset=["Mar Cap - Crore"])

# Rechecking for NaN

In [9]:
# Re-count missing values and re-print the frame summary after the drop.
# data.info() prints its report and returns None, which is why the
# displayed tuple ends with None.
data.isna().sum(), data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 479 entries, 0 to 486
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   S.No.              479 non-null    int64  
 1   Name               479 non-null    object 
 2   Mar Cap - Crore    479 non-null    float64
 3   Sales Qtr - Crore  365 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 18.7+ KB


(S.No.                  0
 Name                   0
 Mar Cap - Crore        0
 Sales Qtr - Crore    114
 dtype: int64,
 None)

# Values in the data sheet are in crore (Cr). We will convert them to rupees and let Tableau handle them later

In [10]:
# Capture the number of rows and columns of the cleaned frame, then display them.
row, col = data.shape
row, col

(479, 4)

In [11]:
# Convert both monetary columns from crore to rupees (1 crore = 10**7).
# One vectorised multiplication per column replaces the per-row iloc loop:
# it produces the same values, avoids hundreds of scalar assignments, and
# names the columns explicitly instead of using positions 2 and 3.
# Missing sales values stay NaN. Column names are left unchanged, so the
# "- Crore" suffix no longer reflects the unit of the stored values.
CRORE_TO_RUPEE = 10**7
value_cols = ["Mar Cap - Crore", "Sales Qtr - Crore"]
data = data.assign(**{name: data[name] * CRORE_TO_RUPEE for name in value_cols})

In [12]:
# Preview the first rows again to confirm the values are now in rupees.
data.head()

Unnamed: 0,S.No.,Name,Mar Cap - Crore,Sales Qtr - Crore
0,1,Reliance Inds.,5834367000000.0,998100000000.0
1,2,TCS,5637098000000.0,309040000000.0
2,3,HDFC Bank,4829536000000.0,205812700000.0
3,4,ITC,3209853000000.0,97720200000.0
4,5,H D F C,2894974000000.0,168405100000.0


# Save the preprocessed data to a new file

In [13]:
# Write the cleaned table to data_v2.csv; index=False keeps the row index out of the file.
data.to_csv("data_v2.csv", index = False)