In [5]:
# Import the pandas library for file manipulation
import pandas as pd
# Set the maximum number of columns and rows to be displayed
pd.set_option('display.max_columns', 64)
pd.set_option('display.max_row', 64)

In [6]:
# Convert the file into a dataframe
file = 'Logistic Regression/data/Data_train_reduced.csv'
df = pd.read_csv(file)

FileNotFoundError: [Errno 2] No such file or directory: 'Logistic Regression/data/Data_train_reduced.csv'

In [None]:
# Drop unnecessary columns
df.drop(columns='Product', axis=1, inplace=True)
df.drop(columns='Respondent.ID', axis=1, inplace=True)
df.drop(columns='q1_1.personal.opinion.of.this.Deodorant', axis=1, inplace=True)

In [None]:
# Analyze the details of the file
print(df.shape)
print(df.dtypes)
print(df.head())

(2500, 61)
Product.ID                                          int64
Instant.Liking                                      int64
q2_all.words                                        int64
q3_1.strength.of.the.Deodorant                      int64
q4_1.artificial.chemical                            int64
                                                   ...   
s11.marital.status                                  int64
s12.working.status                                  int64
s13.2                                             float64
s13a.b.most.often                                   int64
s13b.bottles.of.Deodorant.do.you.currently.own      int64
Length: 61, dtype: object
   Product.ID  Instant.Liking  q2_all.words  q3_1.strength.of.the.Deodorant  \
0         121               1             1                               4   
1         121               0             1                               4   
2         121               0             1                               3   
3        

In [None]:
# Drop all columns that are of object type
for col in df.columns:
    if df[col].dtype == object:
        df.drop(columns=col, axis=1, inplace=True)
        
# Calculate the percentage of null values
percent = df.isnull().sum() / len(df) * 100
# Replace null values with the median
for col in df.columns:
    if percent[col] > 20:
        df.drop(columns=col, axis=1, inplace=True)
    elif 0 < percent[col] <= 20:
        df[col] = df[col].fillna(df[col].median())

Product.ID                                        0.0
Instant.Liking                                    0.0
q2_all.words                                      0.0
q3_1.strength.of.the.Deodorant                    0.0
q4_1.artificial.chemical                          0.0
                                                 ... 
s11.marital.status                                0.0
s12.working.status                                0.0
s13.2                                             0.0
s13a.b.most.often                                 0.0
s13b.bottles.of.Deodorant.do.you.currently.own    0.0
Length: 61, dtype: float64


In [None]:
# Create X and Y variables      
x = df.drop(columns='Instant.Liking', axis=1)
y = df['Instant.Liking']

In [None]:
# Import necessary functions from scikit-learn library
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Initialize the logistic regression model
model = LogisticRegression(max_iter=2000, tol=0.01)
# Initialize the StratifiedKFold function for cross-validation
stratifiedkfold = StratifiedKFold(n_splits=5)
# Calculate the result based on the cross-validation model
result = cross_val_score(model, x, y, cv=stratifiedkfold)

In [None]:
# Print the mean of the results
print(result.mean())

0.7524
