In [None]:
from google.colab import files
uploaded = files.upload()
import pandas as pd
file_name = list(uploaded.keys())[0]
df = pd.read_csv(file_name)
df.drop(['Age', 'Gender', 'Item Purchased', 'Location', 'Size', 'Color', 'Shipping Type', 'Discount Applied', 'Promo Code Used', 'Previous Purchases', 'Payment Method', 'Frequency of Purchases'], axis=1, inplace=True)
modified_file_name = 'pps1.csv'
df.to_csv(modified_file_name, index=False)
files.download(modified_file_name)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE

data = pd.read_csv('pps1.csv')

label_encoder_category = LabelEncoder()
label_encoder_season = LabelEncoder()
label_encoder_subscription = LabelEncoder()

data['Category'] = label_encoder_category.fit_transform(data['Category'])
data['Season'] = label_encoder_season.fit_transform(data['Season'])
data['Subscription Status'] = label_encoder_subscription.fit_transform(data['Subscription Status'])

X = data.drop(columns=['Customer ID', 'Subscription Status'])
y = data['Subscription Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_balanced = scaler.fit_transform(X_train_balanced)
X_test = scaler.transform(X_test)

knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_balanced, y_train_balanced)

y_pred = knn_model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"Precision: {precision_score(y_test, y_pred, zero_division=1):.2f}")
print(f"Recall: {recall_score(y_test, y_pred, zero_division=1):.2f}")
print(f"F1 Score: {f1_score(y_test, y_pred, zero_division=1):.2f}")

def predict_subscription():
    category = input("Enter product category (e.g., Clothing, Footwear): ").title()
    purchase_amount = float(input("Enter purchase amount (USD): "))
    season = input("Enter season (e.g., Winter, Spring): ").title()
    review_rating = float(input("Enter review rating (e.g., 3.5): "))


    if category not in label_encoder_category.classes_ or season not in label_encoder_season.classes_:
        print(f"Error: Category '{category}' or Season '{season}' not found in training data.")
        return


    category_encoded = label_encoder_category.transform([category])[0]
    season_encoded = label_encoder_season.transform([season])[0]


    input_data = [[category_encoded, purchase_amount, season_encoded, review_rating]]
    input_data_scaled = scaler.transform(input_data)

    prediction = knn_model.predict(input_data_scaled)

    if prediction[0] == 1:
        print("Prediction: The customer will subscribe.")
    else:
        print("Prediction: The customer will not subscribe.")

predict_subscription()

Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.55      0.62       834
           1       0.28      0.45      0.35       336

    accuracy                           0.52      1170
   macro avg       0.50      0.50      0.48      1170
weighted avg       0.59      0.52      0.54      1170

Accuracy: 0.52
Precision: 0.28
Recall: 0.45
F1 Score: 0.35
Enter product category (e.g., Clothing, Footwear): clothing
Enter purchase amount (USD): 37
Enter season (e.g., Winter, Spring): Spring
Enter review rating (e.g., 3.5): 3.1
Prediction: The customer will subscribe.




In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Load and preprocess the dataset
data = pd.read_csv('pps1.csv')
label_encoder_category = LabelEncoder()
label_encoder_season = LabelEncoder()
label_encoder_subscription = LabelEncoder()

# Encoding categorical variables
data['Category'] = label_encoder_category.fit_transform(data['Category'])
data['Season'] = label_encoder_season.fit_transform(data['Season'])
data['Subscription Status'] = label_encoder_subscription.fit_transform(data['Subscription Status'])

# Defining features and target
X = data.drop(columns=['Customer ID', 'Subscription Status'])
y = data['Subscription Status']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Balance classes with SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Scale features
scaler = StandardScaler()
X_train_balanced = scaler.fit_transform(X_train_balanced)
X_test = scaler.transform(X_test)

# Model tuning with Random Forest
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train_balanced, y_train_balanced)

# Best model from grid search
best_rf_model = grid_search.best_estimator_
best_rf_model.fit(X_train_balanced, y_train_balanced)

# Make predictions and evaluate
y_pred = best_rf_model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"Precision: {precision_score(y_test, y_pred, zero_division=1):.2f}")
print(f"Recall: {recall_score(y_test, y_pred, zero_division=1):.2f}")
print(f"F1 Score: {f1_score(y_test, y_pred, zero_division=1):.2f}")

# Prediction function remains the same as before
def predict_subscription():
    category = input("Enter product category (e.g., Clothing, Footwear): ").title()
    purchase_amount = float(input("Enter purchase amount (USD): "))
    season = input("Enter season (e.g., Winter, Spring): ").title()
    review_rating = float(input("Enter review rating (e.g., 3.5): "))


    if category not in label_encoder_category.classes_ or season not in label_encoder_season.classes_:
        print(f"Error: Category '{category}' or Season '{season}' not found in training data.")
        return


    category_encoded = label_encoder_category.transform([category])[0]
    season_encoded = label_encoder_season.transform([season])[0]


    input_data = [[category_encoded, purchase_amount, season_encoded, review_rating]]
    input_data_scaled = scaler.transform(input_data)

    prediction = knn_model.predict(input_data_scaled)

    if prediction[0] == 1:
        print("Prediction: The customer will subscribe.")
    else:
        print("Prediction: The customer will not subscribe.")

predict_subscription()

Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.74      0.72       834
           1       0.27      0.24      0.26       336

    accuracy                           0.60      1170
   macro avg       0.49      0.49      0.49      1170
weighted avg       0.58      0.60      0.59      1170

Accuracy: 0.60
Precision: 0.27
Recall: 0.24
F1 Score: 0.26
Enter product category (e.g., Clothing, Footwear): clothing
Enter purchase amount (USD): 67
Enter season (e.g., Winter, Spring): winter
Enter review rating (e.g., 3.5): 3.1
Prediction: The customer will not subscribe.




In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

# Load the dataset
data = pd.read_csv('/content/duplicatepps1.csv')

# Initialize label encoders
label_encoder_category = LabelEncoder()
label_encoder_season = LabelEncoder()
label_encoder_subscription = LabelEncoder()

# Encode categorical variables
data['Category'] = label_encoder_category.fit_transform(data['Category'])
data['Season'] = label_encoder_season.fit_transform(data['Season'])
data['Subscription Status'] = label_encoder_subscription.fit_transform(data['Subscription Status'])

# Define features (X) and target (y)
X = data.drop(columns=['Customer ID', 'Subscription Status'])
y = data['Subscription Status']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply SMOTE to balance the classes in the training set
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Standardize the features
scaler = StandardScaler()
X_train_balanced_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

# Convert the processed training and test sets to DataFrames for saving
X_train_balanced_df = pd.DataFrame(X_train_balanced_scaled, columns=X.columns)
X_train_balanced_df['Subscription Status'] = y_train_balanced.values

X_test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
X_test_df['Subscription Status'] = y_test.values

# Save the processed datasets as CSV files
X_train_balanced_df.to_csv('processed_train_data.csv', index=False)
X_test_df.to_csv('processed_test_data.csv', index=False)

print("Processed datasets have been saved as 'processed_train_data.csv' and 'processed_test_data.csv'.")


FileNotFoundError: [Errno 2] No such file or directory: '/content/duplicatepps1.csv'

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the processed datasets
train_data = pd.read_csv('processed_train_data.csv')
test_data = pd.read_csv('processed_test_data.csv')

# Separate features and target for training
X_train = train_data.drop(columns=['Subscription Status'])
y_train = train_data['Subscription Status']

# Separate features and target for testing
X_test = test_data.drop(columns=['Subscription Status'])
y_test = test_data['Subscription Status']

# Initialize and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

def predict_subscription():
    # Gather user input
    category = input("Enter product category (e.g., Clothing, Footwear): ").title()
    purchase_amount = float(input("Enter purchase amount (USD): "))
    season = input("Enter season (e.g., Winter, Spring): ").title()
    review_rating = float(input("Enter review rating (e.g., 3.5): "))

    # Check if the entered category and season exist in the training data
    if category not in label_encoder_category.classes_ or season not in label_encoder_season.classes_:
        print(f"Error: Category '{category}' or Season '{season}' not found in training data.")
        return

    # Encode categorical variables
    category_encoded = label_encoder_category.transform([category])[0]
    season_encoded = label_encoder_season.transform([season])[0]

    # Prepare input data for prediction
    input_data = [[category_encoded, purchase_amount, season_encoded, review_rating]]

    # Scale the input data
    input_data_scaled = scaler.transform(input_data)

    # Make prediction
    prediction = knn_model.predict(input_data_scaled)

    # Display the prediction result
    if prediction[0] == 1:
        print("Prediction: The customer will subscribe.")
    else:
        print("Prediction: The customer will not subscribe.")

# Call the function to get predictions
predict_subscription()

Accuracy: 0.59
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.73      0.72       834
           1       0.27      0.25      0.26       336

    accuracy                           0.59      1170
   macro avg       0.49      0.49      0.49      1170
weighted avg       0.58      0.59      0.59      1170

Enter product category (e.g., Clothing, Footwear): footwear
Enter purchase amount (USD): 78
