In [586]:
import pandas as pd
import numpy as np

In [587]:
df = pd.read_csv('survey_results.csv')

In [588]:
print(df.shape)

(30010, 17)


## Remove Duplicates 

In [589]:
duplicate_rows_count = df.duplicated().sum()

In [590]:
print(f"Total Number of Duplicate Rows Found (excluding first occurrence): {duplicate_rows_count}")

Total Number of Duplicate Rows Found (excluding first occurrence): 10


In [591]:
df_clean = df.drop_duplicates(keep='first')

In [592]:
initial_rows = len(df)
cleaned_rows = len(df_clean)

print(f"Initial Rows: {initial_rows}")
print(f"Cleaned Rows: {cleaned_rows}")
print(f"Rows Removed: {initial_rows - cleaned_rows}")

Initial Rows: 30010
Cleaned Rows: 30000
Rows Removed: 10


## Outlier Detection in Age

In [593]:
Q1 = df_clean["age"].quantile(0.25)

In [594]:
Q3 = df_clean["age"].quantile(0.75)

In [595]:
IQR = Q3 - Q1

In [596]:
upper_bound = Q3 + 1.5 * IQR

In [597]:
lower_bound = Q1 - 1.5 * IQR

In [598]:
print(f"Upper Boundary for Outliers in age: {upper_bound:.2f}")

Upper Boundary for Outliers in age: 65.50


In [599]:
high_outliers = df_clean[df_clean["age"] > upper_bound]["age"]

In [600]:
if high_outliers.empty:
    max_outlier_value = None
    print(f"\nNo high outliers found in age.")
else:
    max_outlier_value = high_outliers.max()
    print(f"\nMaximum Outlier Value in age: {max_outlier_value:.2f}")


Maximum Outlier Value in age: 604.00


In [601]:
df_clean = df_clean[df_clean['age'] < 100]

## Handling Missing Data

In [602]:
print("1. Columns with Null Values :")
print(df_clean.isnull().any())
print("\n" + "="*40 + "\n")

print("2. Count of Null Values per Column:")
print(df_clean.isnull().sum())

1. Columns with Null Values :
respondent_id                     False
age                               False
gender                            False
zone                              False
occupation                        False
income_levels                      True
consume_frequency(weekly)          True
current_brand                     False
preferable_consumption_size       False
awareness_of_other_brands         False
reasons_for_choosing_brands       False
flavor_preference                 False
purchase_channel                   True
packaging_preference              False
health_concerns                   False
typical_consumption_situations    False
price_range                       False
dtype: bool


2. Count of Null Values per Column:
respondent_id                        0
age                                  0
gender                               0
zone                                 0
occupation                           0
income_levels                     8060
consum

In [603]:
column_to_fill = 'consume_frequency(weekly)'

In [604]:
mode_value = df_clean[column_to_fill].mode()[0]

In [605]:
print(f"\nThe Mode value for '{column_to_fill}' is: {mode_value}")


The Mode value for 'consume_frequency(weekly)' is: 3-4 times


In [606]:
df_clean[column_to_fill].fillna(mode_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[column_to_fill].fillna(mode_value, inplace=True)


In [607]:
print(f"Null values remaining in '{column_to_fill}': {df_clean[column_to_fill].isnull().sum()}")

Null values remaining in 'consume_frequency(weekly)': 0


In [608]:
mode_value = df_clean["purchase_channel"].mode()[0]

In [609]:
print(f"\nThe Mode value for purchase_channel is: {mode_value}")


The Mode value for purchase_channel is: Online


In [610]:
df_clean["purchase_channel"].fillna(mode_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean["purchase_channel"].fillna(mode_value, inplace=True)


In [611]:
print(f"Null values remaining in purchase_channel: {df_clean["purchase_channel"].isnull().sum()}")

Null values remaining in purchase_channel: 0


In [612]:
df_clean["income_levels"].fillna("Not Reported", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean["income_levels"].fillna("Not Reported", inplace=True)


In [613]:
print(f"Null values remaining in income_levels : {df_clean["income_levels"].isnull().sum()}")

Null values remaining in income_levels : 0


In [614]:
print("\nNew Value Counts:")
print(df_clean[target_column].value_counts())


New Value Counts:
income_levels
Not Reported    8060
16L - 25L       5897
10L - 15L       5251
<10L            4661
26L - 35L       3872
> 35L           2250
Name: count, dtype: int64


In [615]:
print("1. Columns with Null Values :")
print(df_clean.isnull().any())
print("\n" + "="*40 + "\n")

print("2. Count of Null Values per Column:")
print(df_clean.isnull().sum())

1. Columns with Null Values :
respondent_id                     False
age                               False
gender                            False
zone                              False
occupation                        False
income_levels                     False
consume_frequency(weekly)         False
current_brand                     False
preferable_consumption_size       False
awareness_of_other_brands         False
reasons_for_choosing_brands       False
flavor_preference                 False
purchase_channel                  False
packaging_preference              False
health_concerns                   False
typical_consumption_situations    False
price_range                       False
dtype: bool


2. Count of Null Values per Column:
respondent_id                     0
age                               0
gender                            0
zone                              0
occupation                        0
income_levels                     0
consume_frequency(weekly

## Correcting Spelling Mistakes

In [616]:
for col in df.select_dtypes('object').columns:
    print(f"{col} = {df_clean[col].unique()}")

respondent_id = ['R00001' 'R00002' 'R00003' ... 'R29998' 'R29999' 'R30000']
gender = ['M' 'F']
zone = ['Urban' 'Metro' 'Rural' 'Semi-Urban' 'Metor' 'urbna']
occupation = ['Working Professional' 'Student' 'Entrepreneur' 'Retired']
income_levels = ['<10L' '> 35L' '16L - 25L' 'Not Reported' '10L - 15L' '26L - 35L']
consume_frequency(weekly) = ['3-4 times' '5-7 times' '0-2 times']
current_brand = ['Newcomer' 'Established' 'newcomer' 'Establishd']
preferable_consumption_size = ['Medium (500 ml)' 'Large (1 L)' 'Small (250 ml)']
awareness_of_other_brands = ['0 to 1' '2 to 4' 'above 4']
reasons_for_choosing_brands = ['Price' 'Quality' 'Availability' 'Brand Reputation']
flavor_preference = ['Traditional' 'Exotic']
purchase_channel = ['Online' 'Retail Store']
packaging_preference = ['Simple' 'Premium' 'Eco-Friendly']
health_concerns = ['Medium (Moderately health-conscious)' 'Low (Not very concerned)'
 'High (Very health-conscious)']
typical_consumption_situations = ['Active (eg. Sports, gym)' 'S

In [617]:
df_clean["zone"] = df_clean["zone"].replace("urbna", "Urban")
df_clean["zone"] = df_clean["zone"].replace("Metor", "Metro")
df_clean["current_brand"] = df_clean["current_brand"].replace("newcomer", "Newcomer")
df_clean["current_brand"] = df_clean["current_brand"].replace("Establishd", "Established")


In [618]:
df_clean["current_brand"].unique()

array(['Newcomer', 'Established'], dtype=object)

In [619]:
df_clean["zone"].unique()

array(['Urban', 'Metro', 'Rural', 'Semi-Urban'], dtype=object)

In [620]:
print(df_clean.shape)

(29991, 17)


# Feature Engineering

### Categorize Age into Age Groups

In [621]:
def categorize_age_group(age):
    if 18 <= age <= 25:
        return '18-25'
    elif 26 <= age <= 35:
        return '26-35'
    elif 36 <= age <= 45:
        return '36-45'
    elif 46 <= age <= 55:
        return '46-55'
    elif 56 <= age <= 70:
        return '56-70'
    elif age > 70:
        return '70+'
    else:
        return None


In [622]:
df_clean['age_group'] = df_clean['age'].apply(categorize_age_group)
df_clean.drop('age', axis=1, inplace=True)
df_clean.head()

Unnamed: 0,respondent_id,gender,zone,occupation,income_levels,consume_frequency(weekly),current_brand,preferable_consumption_size,awareness_of_other_brands,reasons_for_choosing_brands,flavor_preference,purchase_channel,packaging_preference,health_concerns,typical_consumption_situations,price_range,age_group
0,R00001,M,Urban,Working Professional,<10L,3-4 times,Newcomer,Medium (500 ml),0 to 1,Price,Traditional,Online,Simple,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",100-150,26-35
1,R00002,F,Metro,Working Professional,> 35L,5-7 times,Established,Medium (500 ml),2 to 4,Quality,Exotic,Retail Store,Premium,Medium (Moderately health-conscious),Social (eg. Parties),200-250,46-55
2,R00003,F,Rural,Working Professional,> 35L,3-4 times,Newcomer,Medium (500 ml),2 to 4,Availability,Traditional,Retail Store,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",200-250,36-45
3,R00004,F,Urban,Working Professional,16L - 25L,5-7 times,Newcomer,Medium (500 ml),0 to 1,Brand Reputation,Exotic,Online,Eco-Friendly,Low (Not very concerned),"Active (eg. Sports, gym)",150-200,26-35
4,R00005,M,Metro,Student,Not Reported,3-4 times,Established,Medium (500 ml),0 to 1,Availability,Traditional,Online,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",50-100,18-25


###  Create `cf_ab_score` 

In [623]:
df_clean["consume_frequency(weekly)"].unique()

array(['3-4 times', '5-7 times', '0-2 times'], dtype=object)

In [624]:
df_clean["awareness_of_other_brands"].unique()

array(['0 to 1', '2 to 4', 'above 4'], dtype=object)

In [625]:
frequency_mapping = {
    "0-2 times": 1,
    "3-4 times": 2,
    "5-7 times": 3
}
awareness_mapping = {
    "0 to 1": 1,
    "2 to 4": 2,
    "above 4": 3
}
df_clean["consume_frequency(weekly)"] = df_clean["consume_frequency(weekly)"].map(frequency_mapping)
df_clean["awareness_of_other_brands"] = df_clean["awareness_of_other_brands"].map(awareness_mapping)
df_clean.head()

Unnamed: 0,respondent_id,gender,zone,occupation,income_levels,consume_frequency(weekly),current_brand,preferable_consumption_size,awareness_of_other_brands,reasons_for_choosing_brands,flavor_preference,purchase_channel,packaging_preference,health_concerns,typical_consumption_situations,price_range,age_group
0,R00001,M,Urban,Working Professional,<10L,2,Newcomer,Medium (500 ml),1,Price,Traditional,Online,Simple,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",100-150,26-35
1,R00002,F,Metro,Working Professional,> 35L,3,Established,Medium (500 ml),2,Quality,Exotic,Retail Store,Premium,Medium (Moderately health-conscious),Social (eg. Parties),200-250,46-55
2,R00003,F,Rural,Working Professional,> 35L,2,Newcomer,Medium (500 ml),2,Availability,Traditional,Retail Store,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",200-250,36-45
3,R00004,F,Urban,Working Professional,16L - 25L,3,Newcomer,Medium (500 ml),1,Brand Reputation,Exotic,Online,Eco-Friendly,Low (Not very concerned),"Active (eg. Sports, gym)",150-200,26-35
4,R00005,M,Metro,Student,Not Reported,2,Established,Medium (500 ml),1,Availability,Traditional,Online,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",50-100,18-25


In [626]:
df_clean['cf_ab_score'] = (df_clean['consume_frequency(weekly)'] / (df_clean['consume_frequency(weekly)'] + df_clean['awareness_of_other_brands'])).round(2)
df_clean.head()

Unnamed: 0,respondent_id,gender,zone,occupation,income_levels,consume_frequency(weekly),current_brand,preferable_consumption_size,awareness_of_other_brands,reasons_for_choosing_brands,flavor_preference,purchase_channel,packaging_preference,health_concerns,typical_consumption_situations,price_range,age_group,cf_ab_score
0,R00001,M,Urban,Working Professional,<10L,2,Newcomer,Medium (500 ml),1,Price,Traditional,Online,Simple,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",100-150,26-35,0.67
1,R00002,F,Metro,Working Professional,> 35L,3,Established,Medium (500 ml),2,Quality,Exotic,Retail Store,Premium,Medium (Moderately health-conscious),Social (eg. Parties),200-250,46-55,0.6
2,R00003,F,Rural,Working Professional,> 35L,2,Newcomer,Medium (500 ml),2,Availability,Traditional,Retail Store,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",200-250,36-45,0.5
3,R00004,F,Urban,Working Professional,16L - 25L,3,Newcomer,Medium (500 ml),1,Brand Reputation,Exotic,Online,Eco-Friendly,Low (Not very concerned),"Active (eg. Sports, gym)",150-200,26-35,0.75
4,R00005,M,Metro,Student,Not Reported,2,Established,Medium (500 ml),1,Availability,Traditional,Online,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",50-100,18-25,0.67


In [627]:
df_clean['cf_ab_score'].max()

np.float64(0.75)

In [628]:
print(df_clean.shape)

(29991, 18)


### Create Zone Affluence Score (ZAS)

In [629]:
def strip_income_level(income):
    return income.replace(" ", "")

df_clean["income_levels"] = df_clean["income_levels"].apply(strip_income_level)

In [630]:
df_clean["income_levels"].unique()

array(['<10L', '>35L', '16L-25L', 'NotReported', '10L-15L', '26L-35L'],
      dtype=object)

In [631]:
df_clean.head()

Unnamed: 0,respondent_id,gender,zone,occupation,income_levels,consume_frequency(weekly),current_brand,preferable_consumption_size,awareness_of_other_brands,reasons_for_choosing_brands,flavor_preference,purchase_channel,packaging_preference,health_concerns,typical_consumption_situations,price_range,age_group,cf_ab_score
0,R00001,M,Urban,Working Professional,<10L,2,Newcomer,Medium (500 ml),1,Price,Traditional,Online,Simple,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",100-150,26-35,0.67
1,R00002,F,Metro,Working Professional,>35L,3,Established,Medium (500 ml),2,Quality,Exotic,Retail Store,Premium,Medium (Moderately health-conscious),Social (eg. Parties),200-250,46-55,0.6
2,R00003,F,Rural,Working Professional,>35L,2,Newcomer,Medium (500 ml),2,Availability,Traditional,Retail Store,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",200-250,36-45,0.5
3,R00004,F,Urban,Working Professional,16L-25L,3,Newcomer,Medium (500 ml),1,Brand Reputation,Exotic,Online,Eco-Friendly,Low (Not very concerned),"Active (eg. Sports, gym)",150-200,26-35,0.75
4,R00005,M,Metro,Student,NotReported,2,Established,Medium (500 ml),1,Availability,Traditional,Online,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",50-100,18-25,0.67


In [632]:
zone_mapping = {
    "Urban": 3,
    "Metro": 4,
    "Rural": 1,
    "Semi-Urban": 2
}
income_mapping = {
    "<10L": 1,
    "10L-15L": 2,
    "16L-25L": 3,
    "26L-35L": 4,
    ">35L": 5,
    "NotReported": 0
}
df_clean['zone'] = df_clean['zone'].map(zone_mapping)
df_clean['income_levels'] = df_clean['income_levels'].map(income_mapping)

In [633]:
df_clean['zas_score'] = df_clean['zone'] * df_clean['income_levels']

In [634]:
df_clean.head()

Unnamed: 0,respondent_id,gender,zone,occupation,income_levels,consume_frequency(weekly),current_brand,preferable_consumption_size,awareness_of_other_brands,reasons_for_choosing_brands,flavor_preference,purchase_channel,packaging_preference,health_concerns,typical_consumption_situations,price_range,age_group,cf_ab_score,zas_score
0,R00001,M,3,Working Professional,1,2,Newcomer,Medium (500 ml),1,Price,Traditional,Online,Simple,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",100-150,26-35,0.67,3
1,R00002,F,4,Working Professional,5,3,Established,Medium (500 ml),2,Quality,Exotic,Retail Store,Premium,Medium (Moderately health-conscious),Social (eg. Parties),200-250,46-55,0.6,20
2,R00003,F,1,Working Professional,5,2,Newcomer,Medium (500 ml),2,Availability,Traditional,Retail Store,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",200-250,36-45,0.5,5
3,R00004,F,3,Working Professional,3,3,Newcomer,Medium (500 ml),1,Brand Reputation,Exotic,Online,Eco-Friendly,Low (Not very concerned),"Active (eg. Sports, gym)",150-200,26-35,0.75,9
4,R00005,M,4,Student,0,2,Established,Medium (500 ml),1,Availability,Traditional,Online,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",50-100,18-25,0.67,0


In [635]:
df_clean['zas_score'].nunique()

14

In [636]:
print(df_clean.shape)

(29991, 19)


### Brand Switching Indicator (BSI)

In [637]:
df_clean['bsi'] = ((df_clean['current_brand'] != 'Established') & (df['reasons_for_choosing_brands'].isin(['Price', 'Quality']))).astype(int)
df_clean.head()

Unnamed: 0,respondent_id,gender,zone,occupation,income_levels,consume_frequency(weekly),current_brand,preferable_consumption_size,awareness_of_other_brands,reasons_for_choosing_brands,flavor_preference,purchase_channel,packaging_preference,health_concerns,typical_consumption_situations,price_range,age_group,cf_ab_score,zas_score,bsi
0,R00001,M,3,Working Professional,1,2,Newcomer,Medium (500 ml),1,Price,Traditional,Online,Simple,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",100-150,26-35,0.67,3,1
1,R00002,F,4,Working Professional,5,3,Established,Medium (500 ml),2,Quality,Exotic,Retail Store,Premium,Medium (Moderately health-conscious),Social (eg. Parties),200-250,46-55,0.6,20,0
2,R00003,F,1,Working Professional,5,2,Newcomer,Medium (500 ml),2,Availability,Traditional,Retail Store,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",200-250,36-45,0.5,5,0
3,R00004,F,3,Working Professional,3,3,Newcomer,Medium (500 ml),1,Brand Reputation,Exotic,Online,Eco-Friendly,Low (Not very concerned),"Active (eg. Sports, gym)",150-200,26-35,0.75,9,0
4,R00005,M,4,Student,0,2,Established,Medium (500 ml),1,Availability,Traditional,Online,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",50-100,18-25,0.67,0,0


In [638]:
df_clean['current_brand'].dtype, df_clean['reasons_for_choosing_brands'].dtype

(dtype('O'), dtype('O'))

In [639]:
df_clean['current_brand'].isna().sum(), df_clean['reasons_for_choosing_brands'].isna().sum()

(np.int64(0), np.int64(0))

In [640]:
df_clean['bsi'].isna().sum()

np.int64(0)

In [641]:
df_clean.head()

Unnamed: 0,respondent_id,gender,zone,occupation,income_levels,consume_frequency(weekly),current_brand,preferable_consumption_size,awareness_of_other_brands,reasons_for_choosing_brands,flavor_preference,purchase_channel,packaging_preference,health_concerns,typical_consumption_situations,price_range,age_group,cf_ab_score,zas_score,bsi
0,R00001,M,3,Working Professional,1,2,Newcomer,Medium (500 ml),1,Price,Traditional,Online,Simple,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",100-150,26-35,0.67,3,1
1,R00002,F,4,Working Professional,5,3,Established,Medium (500 ml),2,Quality,Exotic,Retail Store,Premium,Medium (Moderately health-conscious),Social (eg. Parties),200-250,46-55,0.6,20,0
2,R00003,F,1,Working Professional,5,2,Newcomer,Medium (500 ml),2,Availability,Traditional,Retail Store,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",200-250,36-45,0.5,5,0
3,R00004,F,3,Working Professional,3,3,Newcomer,Medium (500 ml),1,Brand Reputation,Exotic,Online,Eco-Friendly,Low (Not very concerned),"Active (eg. Sports, gym)",150-200,26-35,0.75,9,0
4,R00005,M,4,Student,0,2,Established,Medium (500 ml),1,Availability,Traditional,Online,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",50-100,18-25,0.67,0,0


### Removing Logical Outliers:

In [642]:
df_clean.groupby('age_group')['occupation'].value_counts()

age_group  occupation          
18-25      Student                 7328
           Working Professional    2605
           Entrepreneur             535
26-35      Working Professional    6570
           Entrepreneur            1826
           Student                  697
36-45      Working Professional    4353
           Entrepreneur            1619
46-55      Working Professional    2167
           Entrepreneur             799
56-70      Retired                 1130
           Entrepreneur             221
           Working Professional     106
           Student                   35
Name: count, dtype: int64

In [643]:
df_clean[(df_clean['age_group'] == '18-25') & (df_clean['occupation']=='Entrepreneur')]

Unnamed: 0,respondent_id,gender,zone,occupation,income_levels,consume_frequency(weekly),current_brand,preferable_consumption_size,awareness_of_other_brands,reasons_for_choosing_brands,flavor_preference,purchase_channel,packaging_preference,health_concerns,typical_consumption_situations,price_range,age_group,cf_ab_score,zas_score,bsi
25,R00026,F,4,Entrepreneur,4,3,Established,Large (1 L),3,Quality,Exotic,Online,Premium,Medium (Moderately health-conscious),Social (eg. Parties),200-250,18-25,0.50,16,0
58,R00059,F,3,Entrepreneur,3,2,Established,Medium (500 ml),1,Quality,Traditional,Online,Eco-Friendly,Low (Not very concerned),Social (eg. Parties),150-200,18-25,0.67,9,0
81,R00082,M,4,Entrepreneur,2,2,Established,Medium (500 ml),3,Price,Traditional,Retail Store,Premium,High (Very health-conscious),"Active (eg. Sports, gym)",200-250,18-25,0.40,8,0
113,R00114,F,3,Entrepreneur,3,3,Established,Medium (500 ml),3,Quality,Traditional,Retail Store,Premium,Low (Not very concerned),"Active (eg. Sports, gym)",200-250,18-25,0.50,9,0
159,R00160,M,1,Entrepreneur,2,1,Newcomer,Small (250 ml),3,Price,Traditional,Online,Simple,Low (Not very concerned),Casual (eg. At home),100-150,18-25,0.25,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29777,R29768,M,3,Entrepreneur,2,1,Newcomer,Medium (500 ml),1,Price,Exotic,Online,Simple,Medium (Moderately health-conscious),Social (eg. Parties),100-150,18-25,0.50,6,1
29852,R29843,M,2,Entrepreneur,3,3,Established,Medium (500 ml),1,Brand Reputation,Traditional,Retail Store,Eco-Friendly,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",100-150,18-25,0.75,6,0
29894,R29885,F,3,Entrepreneur,3,3,Newcomer,Medium (500 ml),2,Brand Reputation,Traditional,Online,Simple,Medium (Moderately health-conscious),Casual (eg. At home),200-250,18-25,0.60,9,0
29968,R29959,M,4,Entrepreneur,1,1,Established,Small (250 ml),1,Price,Traditional,Retail Store,Simple,High (Very health-conscious),Casual (eg. At home),100-150,18-25,0.50,4,0


In [644]:
df_clean = df_clean[~((df_clean['age_group'] == '56-70') & (df_clean['occupation']=='Student'))]
print(df_clean.shape)

(29956, 20)


In [645]:
df_clean['bsi'].value_counts()

bsi
0    20796
1     9160
Name: count, dtype: int64

In [646]:
df_clean.to_csv('cleaned_dataset.csv', index=False)

print("âœ… Successfully exported to 'cleaned_survey_results.csv'!")

âœ… Successfully exported to 'cleaned_survey_results.csv'!
