In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Generate synthetic dataset
np.random.seed(42)
user_data = pd.DataFrame({
    "User ID": np.arange(1, 501),
    "Gender": np.random.choice(["Male", "Female"], 500),
    "Age": np.random.randint(18, 65, 500),
    "EstimatedSalary": np.random.randint(20000, 150000, 500),
    "Purchased": np.random.choice([0, 1], 500)  # 0 = Not Purchased, 1 = Purchased
})

# Step 2: Convert categorical variable ('Gender') into numerical format
label_encoder = LabelEncoder()
user_data["Gender"] = label_encoder.fit_transform(user_data["Gender"])  # Male=1, Female=0

# Step 3: Define independent (X) and dependent (y) variables
X = user_data[["Gender", "Age", "EstimatedSalary"]]
y = user_data["Purchased"]

# Step 4: Split into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Standardize features for better model performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 6: Train Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 7: Make predictions
y_pred = model.predict(X_test)

# Step 8: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}\n")
print("Classification Report:\n", classification_report(y_test, y_pred))

# Step 9: Display first few rows of dataset
print(user_data.head())


Model Accuracy: 0.46

Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.88      0.61        73
           1       0.36      0.06      0.11        77

    accuracy                           0.46       150
   macro avg       0.41      0.47      0.36       150
weighted avg       0.41      0.46      0.35       150

   User ID  Gender  Age  EstimatedSalary  Purchased
0        1       1   38           149668          0
1        2       0   49           124680          1
2        3       1   40            36907          1
3        4       1   50            38777          1
4        5       1   20           113229          0


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

np.random.seed(42)
data = {
    'ID' : np.arange(1, 501),
    'Gender' : np.random.choice(['Male', 'Female'], 500),
    'Age' : np.random.randint(18, 100, 500),
    'EstimatedSalary' : np.random.randint(10000, 50000, 500),
    'Purchased' : np.random.randint([0, 1], 500)    
}

dataset = pd.DataFrame(data)
dataset

ValueError: All arrays must be of the same length

In [7]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

np.random.seed(42)

data = {

    'User ID':np.arange(1,501),
    'Gender':np.random.choice(['Male','Female'],500),
    'Age':np.random.randint(18,60,500),
    'EstimatedSalary':np.random.randint(15000,150000,500),
    'Purchased':np.random.choice([0,1],500)
    
}

df = pd.DataFrame(data)

df['Gender'] = df['Gender'].map({'Male':0,'Female':1})

X=df[['Gender','Age','EstimatedSalary']]
y = df['Purchased']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)

model = LogisticRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)
conf_matrix = confusion_matrix(y_test,y_pred)

print(f"Accuracy:{accuracy}")
print(f"Confusion Matrix\n{conf_matrix}")

Accuracy:0.56
Confusion Matrix
[[21 47]
 [19 63]]
