In [2]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the three source tables (city dimension, respondent dimension,
# survey-response fact table) in one pass; the unpacking order matches
# the order of the paths below.
df_cities, df_respondents, df_responses = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Dataset/dim_cities.csv",
        "../Dataset/dim_repondents.csv",
        "../Dataset/fact_survey_responses.csv",
    )
)

#### Cleaning cities dataset

In [8]:
# Structural overview of the cities table: row count, column names, non-null counts and dtypes.
df_cities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   City_ID  10 non-null     object
 1   City     10 non-null     object
 2   Tier     10 non-null     object
dtypes: object(3)
memory usage: 368.0+ bytes


In [9]:
# Summary statistics; all three columns are object dtype, so describe() reports count/unique/top/freq.
df_cities.describe()

Unnamed: 0,City_ID,City,Tier
count,10,10,10
unique,10,10,2
top,CT111,Delhi,Tier 1
freq,1,1,5


In [11]:
# Count missing values per column of the cities table.
df_cities.isnull().sum()

City_ID    0
City       0
Tier       0
dtype: int64

In [13]:
# True if any row of the cities table repeats an earlier row across all columns.
df_cities.duplicated().any()

False

We can conclude that the cities dataset has no null values and no duplicated rows.

#### Cleaning respondents dataframe

In [7]:
# Structural overview of the respondents table: row count, column names, non-null counts and dtypes.
df_respondents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Respondent_ID  10000 non-null  int64 
 1   Name           10000 non-null  object
 2   Age            10000 non-null  object
 3   Gender         10000 non-null  object
 4   City_ID        10000 non-null  object
dtypes: int64(1), object(4)
memory usage: 390.8+ KB


In [14]:
# describe() summarises only numeric columns by default, so only Respondent_ID is reported here;
# Age is stored as object dtype and is therefore excluded.
df_respondents.describe()

Unnamed: 0,Respondent_ID
count,10000.0
mean,125030.5
std,2886.89568
min,120031.0
25%,122530.75
50%,125030.5
75%,127530.25
max,130030.0


In [15]:
# Count missing values per column of the respondents table.
df_respondents.isnull().sum()

Respondent_ID    0
Name             0
Age              0
Gender           0
City_ID          0
dtype: int64

In [16]:
# True if any row of the respondents table repeats an earlier row across all columns.
df_respondents.duplicated().any()

False

In [4]:
# Derive a coarser 'Age_Group' label from the 'Age' band of each respondent.
# Each key is the label to assign; its value lists the 'Age' bands that map to it.
age_bands_by_group = {
    'Young Adult': ['15-18', '19-30'],
    'Middle Adult': ['31-45', '46-65'],
    'Senior': ['65+'],
}

# One boolean mask per group, in the same order as the labels.
conditions = [
    df_respondents['Age'].isin(age_bands)
    for age_bands in age_bands_by_group.values()
]
choices = list(age_bands_by_group)

# Rows whose 'Age' matches none of the bands above are labelled 'Other'.
df_respondents['Age_Group'] = np.select(conditions, choices, default='Other')

In [None]:
# Persist the respondents table to a separate "updated" file, without the
# DataFrame index. Run this after the Age_Group cell so the derived column
# is included in the output.
df_respondents.to_csv(
    path_or_buf='../Dataset/dim_repondents_updated.csv',
    index=False,
)

We can conclude that the respondents dataset has no null values and no duplicated rows.

#### Cleaning responses dataframe

In [17]:
# Structural overview of the survey responses table: row count, column names, non-null counts and dtypes.
df_responses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Response_ID                     10000 non-null  int64 
 1   Respondent_ID                   10000 non-null  int64 
 2   Consume_frequency               10000 non-null  object
 3   Consume_time                    10000 non-null  object
 4   Consume_reason                  10000 non-null  object
 5   Heard_before                    10000 non-null  object
 6   Brand_perception                10000 non-null  object
 7   General_perception              10000 non-null  object
 8   Tried_before                    10000 non-null  object
 9   Taste_experience                10000 non-null  int64 
 10  Reasons_preventing_trying       10000 non-null  object
 11  Current_brands                  10000 non-null  object
 12  Reasons_for_choosing_brands     10000 non-null 

In [18]:
# describe() summarises only the numeric columns: Response_ID, Respondent_ID and Taste_experience.
# NOTE(review): the two *_ID columns are presumably identifiers, so only Taste_experience is informative here.
df_responses.describe()

Unnamed: 0,Response_ID,Respondent_ID,Taste_experience
count,10000.0,10000.0,10000.0
mean,108000.5,125030.5,3.2819
std,2886.89568,2886.89568,1.239752
min,103001.0,120031.0,1.0
25%,105500.75,122530.75,2.0
50%,108000.5,125030.5,3.0
75%,110500.25,127530.25,4.0
max,113000.0,130030.0,5.0


In [19]:
# Count missing values per column of the survey responses table.
df_responses.isnull().sum()

Response_ID                       0
Respondent_ID                     0
Consume_frequency                 0
Consume_time                      0
Consume_reason                    0
Heard_before                      0
Brand_perception                  0
General_perception                0
Tried_before                      0
Taste_experience                  0
Reasons_preventing_trying         0
Current_brands                    0
Reasons_for_choosing_brands       0
Improvements_desired              0
Ingredients_expected              0
Health_concerns                   0
Interest_in_natural_or_organic    0
Marketing_channels                0
Packaging_preference              0
Limited_edition_packaging         0
Price_range                       0
Purchase_location                 0
Typical_consumption_situations    0
dtype: int64

In [20]:
# True if any row of the survey responses table repeats an earlier row across all columns.
df_responses.duplicated().any()

False

In [21]:
# Preview the first five rows (the head() default) to eyeball the categorical answer values.
df_responses.head()

Unnamed: 0,Response_ID,Respondent_ID,Consume_frequency,Consume_time,Consume_reason,Heard_before,Brand_perception,General_perception,Tried_before,Taste_experience,...,Improvements_desired,Ingredients_expected,Health_concerns,Interest_in_natural_or_organic,Marketing_channels,Packaging_preference,Limited_edition_packaging,Price_range,Purchase_location,Typical_consumption_situations
0,103001,120031,2-3 times a week,To stay awake during work/study,Increased energy and focus,Yes,Neutral,Not sure,No,5,...,Reduced sugar content,Guarana,No,Yes,TV commercials,Compact and portable cans,Yes,50-99,Supermarkets,Studying/working late
1,103002,120032,2-3 times a month,Throughout the day,To boost performance,No,Neutral,Not sure,No,5,...,More natural ingredients,Caffeine,Yes,Not Sure,Print media,Compact and portable cans,No,50-99,Supermarkets,Sports/exercise
2,103003,120033,Rarely,Before exercise,Increased energy and focus,No,Neutral,Not sure,No,2,...,More natural ingredients,Caffeine,No,Yes,Online ads,Innovative bottle design,Not Sure,100-150,Supermarkets,Studying/working late
3,103004,120034,2-3 times a week,To stay awake during work/study,To boost performance,No,Positive,Dangerous,Yes,5,...,Other,Caffeine,No,Yes,Online ads,Compact and portable cans,No,Above 150,Supermarkets,Sports/exercise
4,103005,120035,Daily,To stay awake during work/study,Increased energy and focus,Yes,Neutral,Effective,Yes,5,...,More natural ingredients,Caffeine,Yes,Yes,Online ads,Compact and portable cans,Yes,100-150,Online retailers,Studying/working late


We can conclude that the responses dataset has no null values and no duplicated rows.