<a href="https://colab.research.google.com/github/JQ100/project1/blob/main/cd_project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
# Absolute path of the sales CSV; adjust it when the file lives somewhere else.
filename = '/content/sales_predictions.csv'
# Read the CSV into the DataFrame that every later cell works on.
df = pd.read_csv(filepath_or_buffer=filename)


In [3]:
# Q1
len(df)  # number of rows: 8523

8523

In [4]:
len(df.columns)  # number of columns: 12

12

In [5]:
# Q2
# Item_Identifier -> str
# Item_Weight -> float
# Item_Fat_Content -> str
# Item_Visibility -> float
# Item_Type -> str
# Item_MRP -> float
# Outlet_Identifier -> str
# Outlet_Establishment_Year -> int
# Outlet_Size -> str
# Outlet_Location_Type -> str
# Outlet_Type -> str
# Item_Outlet_Sales -> float

In [6]:
# Q3 - the dataset contains no duplicated rows (the count below is 0)
df.duplicated(keep='first').sum()

0

In [7]:
# Q4 - there are 1463 missing values for Item_Weight and 2410 missing values for Outlet_Size
# (per-column count of NaN entries; all other columns have none)
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [8]:
# Q5 - Since all the provided values are essential to determining sales, we are not dropping any rows/columns.
# Also, since sales numbers have to be precise, it would be risky to use a number like mean or mode to fill in missing values.
# We use -1 as a sentinel to mark the info as missing; it must be left out of any calculations.

# Snapshot the observed weights for the later statistics question. Note that, despite
# its name, this is a Series. Rows already carrying the -1 sentinel are filtered out too,
# so re-running this cell after the fill below does not leak -1 into the statistics.
not_missing_df = df.loc[df['Item_Weight'].ne(-1), 'Item_Weight'].dropna() # needed for a later question

# Assign the filled column back to the frame. Calling fillna(inplace=True) on df[col]
# is chained assignment: it raises a FutureWarning in pandas 2.x and leaves df
# unchanged once Copy-on-Write is enabled.
df['Item_Weight'] = df['Item_Weight'].fillna(-1)
# NOTE(review): Outlet_Size holds text, so the integer -1 makes the column mixed-type;
# kept as-is so any later cell that checks for -1 keeps working.
df['Outlet_Size'] = df['Outlet_Size'].fillna(-1)

In [9]:
# Q6 - after the fill step every column reports zero missing values.
df.isna().sum(axis=0)

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [10]:

# Q7 - fix inconsistencies, mostly textual content
# begin with fat content column
# Lower-case first so case variants collapse, then expand the abbreviations
# ('lf' -> 'low fat', 'reg' -> 'regular'). The result is assigned back to the frame:
# replace(inplace=True) on df[col] is chained assignment, which warns in pandas 2.x
# and does not modify df once Copy-on-Write is enabled.
df['Item_Fat_Content'] = (
    df['Item_Fat_Content']
    .str.lower()
    .replace({'lf': 'low fat', 'reg': 'regular'})
)
# Show the remaining distinct labels to confirm the clean-up.
df['Item_Fat_Content'].unique()

array(['low fat', 'regular'], dtype=object)

In [11]:
# Item_Type: list the distinct labels to check consistency; nothing needs changing
df.loc[:, 'Item_Type'].unique()

array(['Dairy', 'Soft Drinks', 'Meat', 'Fruits and Vegetables',
       'Household', 'Baking Goods', 'Snack Foods', 'Frozen Foods',
       'Breakfast', 'Health and Hygiene', 'Hard Drinks', 'Canned',
       'Breads', 'Starchy Foods', 'Others', 'Seafood'], dtype=object)

In [12]:
# Outlet_Location_Type: list the distinct labels to check consistency; nothing needs changing
df.loc[:, 'Outlet_Location_Type'].unique()

array(['Tier 1', 'Tier 3', 'Tier 2'], dtype=object)

In [13]:
# Outlet_Type: last consistency check of the distinct labels; nothing needs changing
df.loc[:, 'Outlet_Type'].unique()

array(['Supermarket Type1', 'Supermarket Type2', 'Grocery Store',
       'Supermarket Type3'], dtype=object)

In [15]:
# Q8 - the numerical columns are Item_Weight, Item_Visibility, Item_MRP, Outlet_Establishment_Year, and Item_Outlet_Sales
# Item_Weight: computed on the Series of non-missing weights
not_missing_df.min(skipna=True)  # minimum: 4.555

4.555

In [16]:
not_missing_df.max(skipna=True)  # maximum: 21.35

21.35

In [17]:
not_missing_df.mean(skipna=True)  # mean: ~12.858

12.857645184136183

In [18]:
# Item_Visibility
df.loc[:, 'Item_Visibility'].min()  # minimum: 0

0.0

In [19]:
df.loc[:, 'Item_Visibility'].max()  # maximum: ~0.328

0.328390948

In [20]:
df.loc[:, 'Item_Visibility'].mean()  # mean: ~0.066

0.06613202877895127

In [21]:
# Item_MRP
df.loc[:, 'Item_MRP'].min()  # minimum: 31.29

31.29

In [22]:
df.loc[:, 'Item_MRP'].max()  # maximum: ~266.888

266.8884

In [23]:
df.loc[:, 'Item_MRP'].mean()  # mean: ~140.993

140.9927819781768

In [24]:
# Outlet_Establishment_Year
df.loc[:, 'Outlet_Establishment_Year'].min()  # earliest year: 1985

1985

In [25]:
df.loc[:, 'Outlet_Establishment_Year'].max()  # latest year: 2009

2009

In [26]:
df.loc[:, 'Outlet_Establishment_Year'].mean()  # mean: ~1997.83 (rounds to 1998)

1997.8318667135984

In [27]:
# Item_Outlet_Sales
df.loc[:, 'Item_Outlet_Sales'].min()  # minimum: 33.29


33.29

In [28]:
df.loc[:, 'Item_Outlet_Sales'].max()  # maximum: ~13086.965

13086.9648

In [29]:
df.loc[:, 'Item_Outlet_Sales'].mean()  # mean: ~2181.289

2181.2889135750365