In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the survey responses (four "Favorite ..." columns plus Gender) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="/content/Data.csv")

In [3]:
# Per-column count of missing entries; all zeros means no imputation is needed.
print(
    "Missing values:\n",
    data.isna().sum(),
)

Missing values:
 Favorite Color          0
Favorite Music Genre    0
Favorite Beverage       0
Favorite Soft Drink     0
Gender                  0
dtype: int64


In [4]:
# Frequency of each category, one report per preference column.
for heading, column in (
    ("Favorite Color Distribution:\n", 'Favorite Color'),
    ("Favorite Music Genre Distribution:\n", 'Favorite Music Genre'),
    ("Favorite Beverage Distribution:\n", 'Favorite Beverage'),
    ("Favorite Soft Drink Distribution:\n", 'Favorite Soft Drink'),
):
    print(heading, data[column].value_counts())

Favorite Color Distribution:
 Favorite Color
Cool       37
Warm       22
Neutral     7
Name: count, dtype: int64
Favorite Music Genre Distribution:
 Favorite Music Genre
Rock                19
Pop                 17
Hip hop              8
Electronic           8
R&B and soul         6
Folk/Traditional     4
Jazz/Blues           4
Name: count, dtype: int64
Favorite Beverage Distribution:
 Favorite Beverage
Doesn't drink    14
Beer             13
Other            11
Wine             10
Vodka             9
Whiskey           9
Name: count, dtype: int64
Favorite Soft Drink Distribution:
 Favorite Soft Drink
Coca Cola/Pepsi    32
Fanta              14
7UP/Sprite         13
Other               7
Name: count, dtype: int64


In [5]:
# Class balance of the target column.
print("Gender Distribution:\n", data['Gender'].value_counts())


def _count_by_gender(feature):
    """Return a Gender x `feature` table of row counts from `data`.

    Gender/category pairs that never occur in `data` are filled with 0.
    """
    pair_counts = data.groupby(['Gender', feature]).size()
    return pair_counts.unstack(fill_value=0)


# One cross-tabulation per preference column, printed right after it is built.
gender_color = _count_by_gender('Favorite Color')
print("Gender vs Favorite Color:\n", gender_color)

gender_music = _count_by_gender('Favorite Music Genre')
print("Gender vs Favorite Music Genre:\n", gender_music)

gender_beverage = _count_by_gender('Favorite Beverage')
print("Gender vs Favorite Beverage:\n", gender_beverage)

gender_softdrink = _count_by_gender('Favorite Soft Drink')
print("Gender vs Favorite Soft Drink:\n", gender_softdrink)

Gender Distribution:
 Gender
F    33
M    33
Name: count, dtype: int64
Gender vs Favorite Color:
 Favorite Color  Cool  Neutral  Warm
Gender                             
F                 17        3    13
M                 20        4     9
Gender vs Favorite Music Genre:
 Favorite Music Genre  Electronic  Folk/Traditional  Hip hop  Jazz/Blues  Pop  \
Gender                                                                         
F                              2                 2        1           3   13   
M                              6                 2        7           1    4   

Favorite Music Genre  R&B and soul  Rock  
Gender                                    
F                                2    10  
M                                4     9  
Gender vs Favorite Beverage:
 Favorite Beverage  Beer  Doesn't drink  Other  Vodka  Whiskey  Wine
Gender                                                             
F                     6              5      7      4        5     6
M                     7              9      4      5        4     4

In [6]:
# One-hot encode the four categorical preference columns; 'Gender' is not
# listed, so it passes through unchanged and remains available as the target.
data_encoded = pd.get_dummies(
    data=data,
    columns=[
        'Favorite Color',
        'Favorite Music Genre',
        'Favorite Beverage',
        'Favorite Soft Drink',
    ],
)

In [7]:
# Features are every encoded column except the target; the target is 'Gender'.
X = data_encoded.drop(columns='Gender')
y = data_encoded['Gender']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

In [8]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted scaler to both splits so the test rows never influence the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Logistic regression on the scaled training features; max_iter is raised to
# 1000 iterations for the solver.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train_scaled, y=y_train)

In [10]:
# Predicted Gender label for every row of the scaled held-out split.
y_pred = model.predict(X=X_test_scaled)

In [11]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(
    "Accuracy of the logistic regression model:",
    accuracy,
)

Accuracy of the logistic regression model: 0.5
