**Load the dataset and perform necessary data preprocessing.**

In [65]:
#import main data analysis libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Ignore warnings
import warnings
warnings.filterwarnings("ignore")

#Since we have different algorithms, we use different sklearn libraries
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.metrics import precision_score
from sklearn.metrics import  recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [66]:
#Load data and view the first three rows.
data=pd.read_csv('/content/gender classification.csv')
data.head(3)

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
0,Cool,Rock,Vodka,7UP/Sprite,F
1,Neutral,Hip hop,Vodka,Coca Cola/Pepsi,F
2,Warm,Rock,Wine,Coca Cola/Pepsi,F


In [67]:
#Checking shape of the data
data.shape
#They are 66 rows and 5 column

(66, 5)

In [68]:
#Checking the missing values in the dataset
data.isnull().sum()
#Dataset has no missing values.

Favorite Color          0
Favorite Music Genre    0
Favorite Beverage       0
Favorite Soft Drink     0
Gender                  0
dtype: int64

In [69]:
#Check the data types
data.dtypes

#Dataset consist of object data type.

Favorite Color          object
Favorite Music Genre    object
Favorite Beverage       object
Favorite Soft Drink     object
Gender                  object
dtype: object

In [70]:
# Perform encoding for categorical variables
data_dummies = pd.get_dummies(data, columns = ['Favorite Color','Favorite Music Genre',
                                               'Favorite Beverage','Favorite Soft Drink'])

In [71]:
# Separate features (X) and target variable (y)
x = data_dummies.drop('Gender', axis=1)  # Assuming 'gender' is the target variable
y = data_dummies['Gender']

# Split the data into train and test sets
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=42)

In [72]:
# Shuffling the Dataset
data = shuffle(data, random_state = 42)

#creating 4 divisions
div = int(data.shape[0]/4)

# 3 parts to train set and 1 part to test set
train = data.loc[:3*div+1,:]
test = data.loc[3*div+1:]

train.shape, test.shape

((46, 5), (21, 5))

**Explore the data to understand its features and characteristics**

In [73]:
#Viewing the first three rows of train
train.head(3)

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
54,Cool,Electronic,Doesn't drink,Fanta,M
62,Cool,Hip hop,Beer,Coca Cola/Pepsi,M
0,Cool,Rock,Vodka,7UP/Sprite,F


In [74]:
#Viewing the first three rows of test
test.head(3)

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
49,Warm,Hip hop,Beer,Coca Cola/Pepsi,M
1,Neutral,Hip hop,Vodka,Coca Cola/Pepsi,F
21,Warm,Jazz/Blues,Whiskey,Fanta,F


In [75]:
#Data information
print(data.info())

# Summary Statistics
print(data.describe())



<class 'pandas.core.frame.DataFrame'>
Int64Index: 66 entries, 54 to 51
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Favorite Color        66 non-null     object
 1   Favorite Music Genre  66 non-null     object
 2   Favorite Beverage     66 non-null     object
 3   Favorite Soft Drink   66 non-null     object
 4   Gender                66 non-null     object
dtypes: object(5)
memory usage: 5.1+ KB
None
       Favorite Color Favorite Music Genre Favorite Beverage  \
count              66                   66                66   
unique              3                    7                 6   
top              Cool                 Rock     Doesn't drink   
freq               37                   19                14   

       Favorite Soft Drink Gender  
count                   66     66  
unique                   4      2  
top        Coca Cola/Pepsi      M  
freq                    32     33  


**Build a baseline classification model to predict gender based on the provided features**

In [103]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the model
model = LogisticRegression()
model.fit(train_x, train_y)

# Make predictions on the test set
pred_y = model.predict(test_x)


In [77]:
#Simple mode
test['simple_mode'] = train['Gender'].mode()[0]
test['simple_mode'].head()

49    F
1     F
21    F
2     F
53    F
Name: simple_mode, dtype: object

**Evaluate the model's performance using appropriate classification metrics such as accuracy, precision, recall, and F1-score.**

In [78]:
#Calculate the simple accuracy
simple_mode_accuracy = accuracy_score(test['Gender'], test['simple_mode'])
simple_mode_accuracy

0.47619047619047616

In [107]:
# Calculate classification metrics
accuracy = accuracy_score(test_y, pred_y)
precision = precision_score(test_y, pred_y, average='binary', pos_label='F')
recall = recall_score(test_y, pred_y, average='binary', pos_label= 'F')
f1 = f1_score(test_y, pred_y, average='binary', pos_label='F')

# Print the metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")


Accuracy: 0.7142857142857143
Precision: 0.8
Recall: 0.5714285714285714
F1-score: 0.6666666666666666


**Mode based on  favorite color**

In [79]:
#Mode based on  favorite color
favorite_color_mode = pd.crosstab(train['Gender'],train['Favorite Color'])
favorite_color_mode

Favorite Color,Cool,Neutral,Warm
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,12,2,9
M,15,4,4


In [80]:
test['favorite_color_mode'] = test['Gender']

# for every unique value in column
for i in test['Favorite Color'].unique():
# Calculate and Assign mode to new column, corresponding to unique values in "Sex"
  test['favorite_color_mode'][test['Favorite Color'] == str(i)] = train['Gender'][train['Favorite Color'] == str(i)].mode()[0]

In [81]:
#Evaluate the accuracy of the model
favorite_color_accuracy = accuracy_score(test['Gender'], test['favorite_color_mode'])
favorite_color_accuracy

0.42857142857142855

In [82]:
#Evaluate the precision score of the model
favorite_color_precision = precision_score(test['Gender'], test['favorite_color_mode'], average='binary', pos_label='F')
favorite_color_precision

0.4

In [83]:
#Calculating color recall
favorite_color_recall = recall_score(test['Gender'], test['favorite_color_mode'], average='binary', pos_label = 'F')
favorite_color_recall

0.4

In [84]:
#Calcukating color f1
favorite_color_f1 = f1_score(test['Gender'], test['favorite_color_mode'], average='binary', pos_label = 'F')
favorite_color_f1

0.4000000000000001

**Mode Based on Favorite Music Genre**

In [85]:
#Getting music mode
music_mode = pd.crosstab(train['Gender'],train['Favorite Music Genre'])
music_mode

Favorite Music Genre,Electronic,Folk/Traditional,Hip hop,Jazz/Blues,Pop,R&B and soul,Rock
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F,2,2,0,2,9,2,6
M,2,2,6,1,3,3,6


In [86]:
test['music_mode'] = test['Gender']

# for every unique value in column
for i in test['Favorite Music Genre'].unique():
# Calculate and Assign mode to new column, corresponding to unique values in "Sex"
  test['music_mode'][test['Favorite Music Genre'] == str(i)] = train['Gender'][train['Favorite Music Genre'] == str(i)].mode()[0]

In [87]:
#Evaluate the accuracy of the model
music_accuracy = accuracy_score(test['Gender'], test['music_mode'])
music_accuracy

0.5714285714285714

In [88]:
#Evaluate the precision score of the model
music_precision = precision_score(test['Gender'], test['music_mode'], average='binary', pos_label='F')
music_precision

0.5294117647058824

In [89]:
#Calculate music recall
music_recall = recall_score(test['Gender'], test['music_mode'], average='binary', pos_label = 'F')
music_recall

0.9

In [90]:
#Evaluate f1
music_f1 = f1_score(test['Gender'], test['music_mode'], average='binary', pos_label = 'F')
music_f1

0.6666666666666667

**Mode Based on Favorite Beverage**

In [91]:
#Getting becerage mode
beverage_mode = pd.crosstab(train['Gender'],train['Favorite Beverage'])
beverage_mode

Favorite Beverage,Beer,Doesn't drink,Other,Vodka,Whiskey,Wine
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
F,4,5,4,2,3,5
M,6,6,2,3,2,4


In [92]:
test['beverage_mode'] = test['Gender']

# for every unique value in column
for i in test['Favorite Beverage'].unique():
# Calculate and Assign mode to new column, corresponding to unique values in "Sex"
  test['beverage_mode'][test['Favorite Beverage'] == str(i)] = train['Gender'][train['Favorite Beverage'] == str(i)].mode()[0]

In [93]:
#Evaluate the accuracy of the model
beverage_accuracy = accuracy_score(test['Gender'], test['beverage_mode'])
beverage_accuracy

0.6190476190476191

In [94]:
#Evaluate the precision score of the model
beverage_precision = precision_score(test['Gender'], test['beverage_mode'], average='binary', pos_label='F')
beverage_precision

0.6

In [95]:
#Calculate beverage recall
beverage_recall = recall_score(test['Gender'], test['beverage_mode'], average='binary', pos_label = 'F')
beverage_recall

0.6

In [96]:
#Evaluate f1
beverage_f1 = f1_score(test['Gender'], test['beverage_mode'], average='binary', pos_label = 'F')
beverage_f1

0.6

**Mode Based on Favorite Soft Drink**

In [97]:
#Soft drink mode
soft_drink_mode = pd.crosstab(train['Gender'],train['Favorite Soft Drink'])
soft_drink_mode

Favorite Soft Drink,7UP/Sprite,Coca Cola/Pepsi,Fanta,Other
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F,7,11,3,2
M,4,11,4,4


In [98]:
test['soft_drink_mode'] = test['Gender']

# for every unique value in column
for i in test['Favorite Soft Drink'].unique():
# Calculate and Assign mode to new column, corresponding to unique values in "Sex"
  test['soft_drink_mode'][test['Favorite Soft Drink'] == str(i)] = train['Gender'][train['Favorite Soft Drink'] == str(i)].mode()[0]

In [99]:
#Evaluate the accuracy of the model
soft_drink_accuracy = accuracy_score(test['Gender'], test['soft_drink_mode'])
soft_drink_accuracy

0.5714285714285714

In [100]:
#Evaluate the precision score of the model
soft_drink_precision = precision_score(test['Gender'], test['soft_drink_mode'], average='binary', pos_label='F')
soft_drink_precision

0.5384615384615384

In [101]:
#Calculate recall
soft_drink_recall = recall_score(test['Gender'], test['soft_drink_mode'], average='binary', pos_label = 'F')
soft_drink_recall

0.7

In [102]:
#Evaluate f1
soft_drink_f1 = f1_score(test['Gender'], test['soft_drink_mode'], average='binary', pos_label = 'F')
soft_drink_f1

0.608695652173913