In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Purchase classifiers for MILK and MEAT.
#
# Flow: load the CSV, turn the two continuous targets into 'Y'/'N' labels,
# one-hot encode the categorical features, split, impute, fit one random
# forest per target, report hold-out accuracy, then write predictions for
# every labelled row back to disk.

RAW_DATA_CSV = "/content/drive/MyDrive/Colab Notebooks/processed_problem_1_data.csv"
PREDICTIONS_CSV = "/content/drive/MyDrive/Colab Notebooks/predictions.csv"
RANDOM_STATE = 42  # shared by the split and both forests so a re-run reproduces the scores
_YES_NO = {True: "Y", False: "N"}


def label_purchases(frame, target_cols=("MILK", "MEAT")):
    """Return a labelled copy of ``frame``; the input is not modified.

    Rows with a missing value in any of ``target_cols`` are dropped, then each
    target column is replaced by 'Y' where the value is > 0 and 'N' otherwise.
    The original row index and column order are preserved.
    """
    target_cols = list(target_cols)
    kept = frame.dropna(subset=target_cols)
    # assign() builds a new frame, so neither `frame` nor `kept` is mutated.
    return kept.assign(**{col: kept[col].gt(0).map(_YES_NO) for col in target_cols})


def fit_and_score(X_train, y_train, X_test, y_test, random_state=RANDOM_STATE):
    """Fit a seeded random forest and score it on the hold-out rows.

    Returns a tuple ``(classifier, test_predictions, accuracy)``.
    """
    classifier = RandomForestClassifier(random_state=random_state)
    classifier.fit(X_train, y_train)
    test_predictions = classifier.predict(X_test)
    return classifier, test_predictions, accuracy_score(y_test, test_predictions)


# In a notebook kernel __name__ is "__main__", so the cell runs exactly as
# before; the guard only keeps the Drive I/O from firing when the helpers above
# are imported elsewhere (e.g. by a test).
if __name__ == "__main__":
    # Load the data and build the binary targets.
    # NOTE(review): rows with a missing MILK or MEAT value are dropped here and
    # therefore never receive a prediction below - confirm that is intended.
    data = label_purchases(pd.read_csv(RAW_DATA_CSV))

    # Split features and target variables
    X = data.drop(columns=["MAGIC_KEY", "MILK", "MEAT"])
    y_milk = data["MILK"]
    y_meat = data["MEAT"]

    # Encode categorical features
    categorical_cols = ["QUALITY", "DELIVERY_OPTION"]
    X_encoded = pd.get_dummies(X, columns=categorical_cols)

    # Split BEFORE imputing so the hold-out rows cannot influence the fill
    # values. One call with both targets yields the same row partition the two
    # separate calls produced (same inputs, same random_state).
    (X_train_raw, X_test_raw,
     y_train_milk, y_test_milk,
     y_train_meat, y_test_meat) = train_test_split(
        X_encoded, y_milk, y_meat, test_size=0.2, random_state=RANDOM_STATE
    )

    # Impute missing feature values: fit on the training rows only, then reuse
    # the learned fill values everywhere else.
    imputer = SimpleImputer(strategy='most_frequent')
    X_train_milk = X_train_meat = imputer.fit_transform(X_train_raw)
    X_test_milk = X_test_meat = imputer.transform(X_test_raw)
    X_imputed = imputer.transform(X_encoded)

    # Model training, hold-out predictions and evaluation
    milk_classifier, milk_predictions, milk_accuracy = fit_and_score(
        X_train_milk, y_train_milk, X_test_milk, y_test_milk
    )
    meat_classifier, meat_predictions, meat_accuracy = fit_and_score(
        X_train_meat, y_train_meat, X_test_meat, y_test_meat
    )

    print("Accuracy for predicting milk purchases:", milk_accuracy)
    print("Accuracy for predicting meat purchases:", meat_accuracy)

    # Make predictions on the provided data.
    # NOTE(review): X_imputed contains the rows the forests were trained on, so
    # these labels are not out-of-sample predictions - confirm that is intended.
    predictions_milk = milk_classifier.predict(X_imputed)
    predictions_meat = meat_classifier.predict(X_imputed)

    # Predictions are already "Y"/"N" because the forests were fit on those labels.
    data["MILK_PURCHASE"] = predictions_milk
    data["MEAT_PURCHASE"] = predictions_meat

    # Save the results
    data.to_csv(PREDICTIONS_CSV, index=False)


Accuracy for predicting milk purchases: 0.9482758620689655
Accuracy for predicting meat purchases: 0.9482758620689655


In [None]:
import pandas as pd

# Build the submission file: one PURCHASE flag per MAGIC_KEY, 'Y' when either
# the milk or the meat prediction is 'Y'.

PREDICTIONS_CSV = "/content/drive/MyDrive/Colab Notebooks/predictions.csv"
SUBMISSION_CSV = "/content/drive/MyDrive/Colab Notebooks/submission_1.csv"


def any_purchase(frame):
    """Return a 'Y'/'N' Series aligned with ``frame``'s index.

    A row is 'Y' when its MILK_PURCHASE or MEAT_PURCHASE value equals 'Y';
    every other value (including missing ones) counts as not purchased.
    Computed column-wise, so it also works on an empty frame, where a
    row-wise ``apply(axis=1)`` would hand back a DataFrame instead of a Series.
    """
    bought = frame["MILK_PURCHASE"].eq("Y") | frame["MEAT_PURCHASE"].eq("Y")
    return bought.map({True: "Y", False: "N"})


# In a notebook kernel __name__ is "__main__", so the cell runs as before; the
# guard only keeps the Drive I/O from firing when any_purchase is imported.
if __name__ == "__main__":
    # Load the data with predictions
    data = pd.read_csv(PREDICTIONS_CSV)

    # Create the new column "PURCHASE"
    data['PURCHASE'] = any_purchase(data)

    # Select only "MAGIC_KEY" and "PURCHASE" columns
    submission_data = data[["MAGIC_KEY", "PURCHASE"]]

    # Save to a new CSV file
    submission_data.to_csv(SUBMISSION_CSV, index=False)
