In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily flatten the directory walk into one stream of full file paths,
# then print each path on its own line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/samplebigmartdata/big_mart_sales_top_row_error.csv


In [2]:
"""
How to impute the missing values using loc in any column

"""

import pandas as pd

# Load the CSV, skipping the first five lines of the file before parsing starts.
sample_df = pd.read_csv(
    '../input/samplebigmartdata/big_mart_sales_top_row_error.csv',
    skiprows=5,
)

# Count the missing values in every column to see which columns need imputing.

sample_df.isnull().sum()




Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [3]:
"""
Imputing missing values

1. Using loc function

2. Using fillna()

"""

'\nImputing missing values\n\n1. Using loc function\n\n2. Using fillna()\n\n'

In [4]:
# Show each column's dtype to confirm Item_Weight is numeric before taking its mean.
print(sample_df.dtypes)

# Using loc function to fill the null values of Item_Weight with mean.
# isna() already returns a boolean mask, so it is used directly (no `== True` comparison).
# Series.mean() skips NaN by default, so the mean is computed from the observed weights only.

sample_df.loc[sample_df['Item_Weight'].isna(), 'Item_Weight'] = sample_df['Item_Weight'].mean()

# Re-count the missing values per column after the fill.
sample_df.isna().sum()

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object


Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [5]:
# Using fillna() -- this is another way to impute the missing values.

# Outlet_Size is a categorical variable -- replace its missing values with the mode.

print(sample_df.Outlet_Size.mode())

# mode() returns a Series (there can be ties), so take its first entry instead of
# hard-coding the label.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on a column
# selected from the frame acts on an intermediate object and is not guaranteed to update
# sample_df (it does not under pandas Copy-on-Write).
sample_df['Outlet_Size'] = sample_df['Outlet_Size'].fillna(sample_df['Outlet_Size'].mode().iloc[0])

sample_df.isna().sum()


0    Medium
dtype: object


Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [6]:
# Tally how many rows carry each distinct label in the Item_Fat_Content column

sample_df['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [7]:
"""
We can see that the categories Low Fat, LF and low fat are the same, and that Regular and reg are also the same. To keep the data clean, we will map all

of these to only two categories - LF and R - using the map function

"""

# Create a new mapping dictionary: every spelling of a label -> its canonical category.
# 'R' maps to itself (as 'LF' already did) so that re-running this cell is a no-op:
# Series.map() turns any value that is missing from the dict into NaN, which would
# otherwise wipe out every 'R' on a second run.

mapping_dict = {
    'Low Fat': 'LF',
    'low fat': 'LF',
    'LF': 'LF',
    'Regular': 'R',
    'reg': 'R',
    'R': 'R',
}


# Use the map function to update the values

sample_df['Item_Fat_Content'] = sample_df['Item_Fat_Content'].map(mapping_dict)

sample_df['Item_Fat_Content'].value_counts()

LF    5517
R     3006
Name: Item_Fat_Content, dtype: int64

In [8]:
"""
Creating a new column by modifying the existing column

"""

# Divisor used to derive the USD column from Item_MRP.
# NOTE(review): 74 is carried over unchanged from the original cell; confirm the
# source currency and the exchange rate it is meant to represent.
MRP_UNITS_PER_USD = 74

# Creating a new column Item_MRP_in_USD by dividing each value in the column Item_MRP
# by the divisor above. Series division is vectorized and yields the same values as
# .apply(lambda x: x/74) without a Python-level function call per row.

sample_df['Item_MRP_in_USD'] = sample_df['Item_MRP'] / MRP_UNITS_PER_USD

sample_df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_MRP_in_USD
0,FDA15,9.300,LF,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,3.375800
1,DRC01,5.920,R,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,0.652286
2,FDN15,17.500,LF,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,1.913757
3,FDX07,19.200,R,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,Medium,Tier 3,Grocery Store,732.3800,2.460743
4,NCD19,8.930,LF,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,0.727857
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,LF,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834,2.898943
8519,FDS36,8.380,R,0.046982,Baking Goods,108.1570,OUT045,2002,Medium,Tier 2,Supermarket Type1,549.2850,1.461581
8520,NCJ29,10.600,LF,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136,1.150303
8521,FDN46,7.210,R,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976,1.393692
