In [1]:
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
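# Example: fetching data from several joined tables.
# Tables are aliased t1, t2, ... in the order they appear in table_names, and
# join_conditions[i] is the ON condition used when joining table_names[i + 1].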
# attributes = ["t1.attribute1", "t2.attribute2", "t3.attribute3", "t4.attribute4"] # Any attribute names that we want to fetch
# table_names = ["table1", "table2", "table3", "table4"]  # The table names that we want to join
# join_conditions = ["t1.id = t2.id", "t1.id = t3.id", "t1.id = t4.id"]  # The join conditions for the tables

In [3]:
# Example of how to fetch data from a single table.
# Attributes are written as "<alias>.<column>"; the query cell aliases the tables
# t1, t2, ... in the order they appear in table_names.
attributes = ["t1.year", "t1.playoff"]  # Columns to fetch
table_names = ["teams"]  # Tables to read from (joined in order when there is more than one)
join_conditions = []  # One ON condition per additional table; empty for a single table

# Strip the table alias from each attribute to get the bare column name.
# Taking the last dot-separated part also accepts unqualified names such as "year",
# which would raise an IndexError with a fixed index of 1.
cleaned_attributes = [attr.rsplit('.', 1)[-1] for attr in attributes]

In [4]:
# Build a SELECT over the configured tables, run it against the SQLite database
# and load the result into the dataframe `df`.

# Fail early with a clear message when the configuration is inconsistent, instead of
# an IndexError (too few join conditions) or a silently ignored entry (too many)
if not table_names:
    raise ValueError("table_names must contain at least one table")
if len(join_conditions) != len(table_names) - 1:
    raise ValueError(
        f"Expected {len(table_names) - 1} join condition(s) for {len(table_names)} table(s), "
        f"got {len(join_conditions)}"
    )

# Generate the select and join clauses dynamically.
# NOTE: identifiers cannot be bound as SQL parameters, so the query is assembled from
# strings; attributes, table_names and join_conditions must stay trusted, notebook-defined values.
select_clause = ", ".join(attributes)
join_parts = [f"{table_names[0]} t1"]  # The first table is always aliased t1
for alias_number, (table_name, condition) in enumerate(zip(table_names[1:], join_conditions), start=2):
    join_parts.append(f"JOIN {table_name} t{alias_number} ON {condition}")
join_clause = " ".join(join_parts)

# Create the complete query string
query = f"SELECT {select_clause} FROM {join_clause}"

# Connect to the existing SQLite database and execute the query
conn = sqlite3.connect('db/database.db')
try:
    cursor = conn.cursor()
    cursor.execute(query)
    rows = cursor.fetchall()
finally:
    # Close the connection even if the query fails, so the handle is never leaked
    conn.close()

# Convert the fetched data to a pandas dataframe, naming the columns with the
# alias-free attribute names computed in the configuration cell
df = pd.DataFrame(rows, columns=cleaned_attributes)

In [5]:
# Keep only the rows whose 'year' value is at most 5 (the first five years of data)
df_first_five_years = df.loc[df['year'].le(5)]

In [6]:
# Split configuration
target = 'playoff'  # Name of the column to predict
seed = 42  # Fixed random seed for the train-test split
train_ratio = 0.8  # Fraction of rows used for training; the remaining 20% is used for testing

# Features are every column except the target; the target column is the label
x = df_first_five_years.drop(target, axis='columns')
y = df_first_five_years[target]

# Stratified split on the label, seeded so the same split is produced on every run
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=train_ratio,
    stratify=y,
    random_state=seed,
)

# Report the size of each partition
print(f"Shape of X_train: {x_train.shape}")
print(f"Shape of X_test: {x_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X_train: (60, 1)
Shape of X_test: (15, 1)
Shape of y_train: (60,)
Shape of y_test: (15,)


In [7]:
# Pipeline steps as (name, estimator) pairs; the names are the keys used to look
# the steps up again through pipeline.named_steps.
# The classifier slot can hold any classifier (e.g. SVC) to try out other models.
# TODO: the classifier reuses the train-test split seed (seed=42); consider a separate one.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=seed)),
]
pipeline = Pipeline(steps=pipeline_steps)

# Fit the scaler and the classifier on the training data
pipeline.fit(x_train, y_train)

Accuracy: 0.4666666666666667


In [9]:
# Look the individual steps up by name through pipeline.named_steps: 'scaler' gives
# the scaler object and 'classifier' gives the classifier object.
# Use this to test n models and compare them later.
scaler_model, classifier_model = (
    pipeline.named_steps[step_name] for step_name in ('scaler', 'classifier')
)