In [1]:
import pandas as pd

# Load the dataset
def load_data(data):
    return pd.read_csv(data.csv)

# Remove duplicate rows
def remove_duplicates(df):
    return df.drop_duplicates()

# Handle missing data
def handle_missing_data(df, method='drop', fill_value=None):
    if method == 'drop':
        return df.dropna()
    elif method == 'fill':
        return df.fillna(fill_value)
    else:
        return df

# Correct data types
def correct_data_types(df, column_type_dict):
    for column, dtype in column_type_dict.items():
        df[column] = df[column].astype(dtype)
    return df

# Filter irrelevant data
def filter_data(df, columns_to_keep):
    return df[columns_to_keep]

In [4]:
def filter_data(df, columns_to_keep):
    return df[columns_to_keep]

In [7]:
# Let's read the first few lines of the file to identify the delimiter and the general structure
with open('data.csv', 'r') as file:
    first_lines = [next(file) for _ in range(5)]

first_lines

['Row ID;Order ID;Order Date;Ship Date;Ship Mode;Customer ID;Customer Name;Segment;Country;City;State;Postal Code;Region;Product ID;Category;Sub-Category;Product Name;Cost;Price;Profit;Quantity;Sales\n',
 '1;CA-2017-152156;8/11/17;11/11/17;Second Class;CG-12520;Claire Gute;Consumer;United States;Henderson;Kentucky;42420;South;FUR-BO-10001798;Furniture;Bookcases;Bush Somerset Collection Bookcase;464.48;901.06;436.58;4;3.604.243.977\n',
 '2;CA-2017-152156;8/11/17;11/11/17;Second Class;CG-12520;Claire Gute;Consumer;United States;Henderson;Kentucky;42420;South;FUR-CH-10000454;Furniture;Chairs;Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back;756.10;138.70;-617.40;12;1.664.369.269\n',
 '3;CA-2017-138688;12/6/17;16/6/17;Second Class;DV-13045;Darrin Van Huff;Corporate;United States;Los Angeles;California;90036;West;OFF-LA-10000240;Office Supplies;Labels;Self-Adhesive Address Labels for Typewriters by Universal;537.68;159.28;-378.40;12;191.139.775\n',
 "4;US-2016-108966;11/10/16;18/1

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv('data.csv', delimiter=';')

# Convert date columns to datetime format
data['Order Date'] = pd.to_datetime(data['Order Date'], format='%d/%m/%y')
data['Ship Date'] = pd.to_datetime(data['Ship Date'], format='%d/%m/%y')

# Handle missing values in 'Postal Code' column by filling with the most frequent value
data['Postal Code'].fillna(data['Postal Code'].mode()[0], inplace=True)

# Convert 'Sales' column to numeric format
data['Sales'] = data['Sales'].str.replace('.', '').astype(float)

# Encode categorical variables using one-hot encoding
data_encoded = pd.get_dummies(data, columns=[
    'Ship Mode', 'Segment', 'Country', 'City', 'State', 'Region', 
    'Category', 'Sub-Category'
])

# Convert date columns to timestamps
data_encoded['Order Date'] = pd.to_numeric(data_encoded['Order Date'])
data_encoded['Ship Date'] = pd.to_numeric(data_encoded['Ship Date'])

# Select features and target
features = data_encoded.drop(columns=['Quantity', 'Profit', 'Order ID', 'Customer ID', 'Customer Name', 'Product ID', 'Product Name'])
target_quantity = data_encoded['Quantity'] > data_encoded['Quantity'].median()  # Binary classification: above median

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target_quantity, test_size=0.2, random_state=42)

# Define the parameter grid for Randomized Search
param_dist = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Initialize RandomizedSearchCV with cross-validation
random_search = RandomizedSearchCV(estimator=DecisionTreeClassifier(), param_distributions=param_dist, 
                                   n_iter=50, cv=5, n_jobs=-1, verbose=2, random_state=42)

# Fit RandomizedSearchCV to the data
random_search.fit(X_train, y_train)

# Get the best parameters and the best score
best_params_random = random_search.best_params_
best_score_random = random_search.best_score_

# Train the Decision Tree model with the best parameters
best_decision_tree_model = DecisionTreeClassifier(**best_params_random)
best_decision_tree_model.fit(X_train, y_train)
decision_tree_preds = best_decision_tree_model.predict(X_test)

# Evaluate the model
decision_tree_accuracy = accuracy_score(y_test, decision_tree_preds)
decision_tree_report = classification_report(y_test, decision_tree_preds)

print(f"Best Parameters: {best_params_random}")
print(f"Best Cross-Validation Score: {best_score_random}")
print(f"Decision Tree Accuracy: {decision_tree_accuracy}")
print(f"Classification Report:\n{decision_tree_report}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 10, 'criterion': 'entropy'}
Best Cross-Validation Score: 0.9075941289087428
Decision Tree Accuracy: 0.9198570699336396
Classification Report:
              precision    recall  f1-score   support

       False       0.95      0.89      0.92      1008
        True       0.89      0.95      0.92       951

    accuracy                           0.92      1959
   macro avg       0.92      0.92      0.92      1959
weighted avg       0.92      0.92      0.92      1959



In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error

# Load the dataset
data = pd.read_csv('data.csv', delimiter=';')

# Convert date columns to datetime format
data['Order Date'] = pd.to_datetime(data['Order Date'], format='%d/%m/%y')
data['Ship Date'] = pd.to_datetime(data['Ship Date'], format='%d/%m/%y')

# Handle missing values in 'Postal Code' column by filling with the most frequent value
data['Postal Code'].fillna(data['Postal Code'].mode()[0], inplace=True)

# Convert 'Sales' column to numeric format
data['Sales'] = data['Sales'].str.replace('.', '').astype(float)

# Encode categorical variables using one-hot encoding
data_encoded = pd.get_dummies(data, columns=[
    'Ship Mode', 'Segment', 'Country', 'City', 'State', 'Region', 
    'Category', 'Sub-Category'
])

# Convert date columns to timestamps
data_encoded['Order Date'] = pd.to_numeric(data_encoded['Order Date'])
data_encoded['Ship Date'] = pd.to_numeric(data_encoded['Ship Date'])

# Select features and target
features = data_encoded.drop(columns=['Order ID', 'Customer ID', 'Customer Name', 'Product ID', 'Product Name'])
target_quantity = data_encoded['Quantity'] > data_encoded['Quantity'].median()  # Binary classification: above median
target_sales = data_encoded['Sales']  # Regression: predicting sales

# Split the data into training and testing sets
X_train, X_test, y_train_class, y_test_class = train_test_split(features, target_quantity, test_size=0.2, random_state=42)
_, _, y_train_reg, y_test_reg = train_test_split(features, target_sales, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Logistic Regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_scaled, y_train_class)
logistic_preds = logistic_model.predict(X_test_scaled)

# Evaluate the Logistic Regression model
logistic_accuracy = accuracy_score(y_test_class, logistic_preds)
logistic_report = classification_report(y_test_class, logistic_preds)

print(f"Logistic Regression Accuracy: {logistic_accuracy}")
print(f"Classification Report:\n{logistic_report}")

# Train the Linear Regression model
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train_scaled, y_train_reg)
linear_reg_preds = linear_reg_model.predict(X_test_scaled)

# Evaluate the Linear Regression model
regression_mse = mean_squared_error(y_test_reg, linear_reg_preds)

print(f"Linear Regression Mean Squared Error: {regression_mse}")


Logistic Regression Accuracy: 0.9851965288412455
Classification Report:
              precision    recall  f1-score   support

       False       0.98      0.99      0.99      1008
        True       0.99      0.98      0.98       951

    accuracy                           0.99      1959
   macro avg       0.99      0.99      0.99      1959
weighted avg       0.99      0.99      0.99      1959

Linear Regression Mean Squared Error: 2169617738918666.0


In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error

# Load the dataset
data = pd.read_csv('data.csv', delimiter=';')

# Convert date columns to datetime format
data['Order Date'] = pd.to_datetime(data['Order Date'], format='%d/%m/%y')
data['Ship Date'] = pd.to_datetime(data['Ship Date'], format='%d/%m/%y')

# Handle missing values in 'Postal Code' column by filling with the most frequent value
data['Postal Code'].fillna(data['Postal Code'].mode()[0], inplace=True)

# Convert 'Sales' column to numeric format
data['Sales'] = data['Sales'].str.replace('.', '').astype(float)

# Encode categorical variables using one-hot encoding
data_encoded = pd.get_dummies(data, columns=[
    'Ship Mode', 'Segment', 'Country', 'City', 'State', 'Region', 
    'Category', 'Sub-Category'
])

# Convert date columns to timestamps
data_encoded['Order Date'] = pd.to_numeric(data_encoded['Order Date'])
data_encoded['Ship Date'] = pd.to_numeric(data_encoded['Ship Date'])

# Select features and target
features = data_encoded.drop(columns=['Order ID', 'Customer ID', 'Customer Name', 'Product ID', 'Product Name'])
target_quantity = data_encoded['Quantity'] > data_encoded['Quantity'].median()  # Binary classification: above median
target_sales = data_encoded['Sales']  # Regression: predicting sales

# Split the data into training and testing sets
X_train, X_test, y_train_class, y_test_class = train_test_split(features, target_quantity, test_size=0.2, random_state=42)
_, _, y_train_reg, y_test_reg = train_test_split(features, target_sales, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Logistic Regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_scaled, y_train_class)
logistic_preds = logistic_model.predict(X_test_scaled)

# Evaluate the Logistic Regression model
logistic_accuracy = accuracy_score(y_test_class, logistic_preds)
logistic_report = classification_report(y_test_class, logistic_preds)

print(f"Logistic Regression Accuracy: {logistic_accuracy}")
print(f"Classification Report:\n{logistic_report}")

# Train and evaluate Linear Regression model
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train_scaled, y_train_reg)
linear_reg_preds = linear_reg_model.predict(X_test_scaled)
linear_reg_mse = mean_squared_error(y_test_reg, linear_reg_preds)
print(f"Linear Regression Mean Squared Error: {linear_reg_mse}")

# Train and evaluate Decision Tree Regressor
tree_reg_model = DecisionTreeRegressor()
tree_reg_model.fit(X_train_scaled, y_train_reg)
tree_reg_preds = tree_reg_model.predict(X_test_scaled)
tree_reg_mse = mean_squared_error(y_test_reg, tree_reg_preds)
print(f"Decision Tree Regressor Mean Squared Error: {tree_reg_mse}")

# Train and evaluate Random Forest Regressor
forest_reg_model = RandomForestRegressor(n_estimators=100)
forest_reg_model.fit(X_train_scaled, y_train_reg)
forest_reg_preds = forest_reg_model.predict(X_test_scaled)
forest_reg_mse = mean_squared_error(y_test_reg, forest_reg_preds)
print(f"Random Forest Regressor Mean Squared Error: {forest_reg_mse}")

# Train and evaluate Gradient Boosting Regressor
gb_reg_model = GradientBoostingRegressor(n_estimators=100)
gb_reg_model.fit(X_train_scaled, y_train_reg)
gb_reg_preds = gb_reg_model.predict(X_test_scaled)
gb_reg_mse = mean_squared_error(y_test_reg, gb_reg_preds)
print(f"Gradient Boosting Regressor Mean Squared Error: {gb_reg_mse}")


Logistic Regression Accuracy: 0.9851965288412455
Classification Report:
              precision    recall  f1-score   support

       False       0.98      0.99      0.99      1008
        True       0.99      0.98      0.98       951

    accuracy                           0.99      1959
   macro avg       0.99      0.99      0.99      1959
weighted avg       0.99      0.99      0.99      1959

Linear Regression Mean Squared Error: 2169617738918666.0
Decision Tree Regressor Mean Squared Error: 9150833420702.795
Random Forest Regressor Mean Squared Error: 2330352269850.9434
Gradient Boosting Regressor Mean Squared Error: 350185049956428.9
