In [None]:
import seaborn
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Load the dataset and inspect it. info() reports each column's dtype and
# non-null count (which exposes missing values); describe() prints summary
# statistics for the numeric columns.
data = pd.read_csv('tennis_data.csv')
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
data.info()
print(data.describe())

In [None]:
# Fit a one-feature linear regression and report it only when BOTH the
# training and the test R^2 reach the threshold (0.8 by default).
def single_regression(feature, response, threshold=0.8, random_state=None):
    """Regress ``response`` on one ``feature`` column of the global ``data`` frame.

    The rows are split 80/20 into train and test sets, a ``LinearRegression``
    is fitted on the training part, and both scores are computed. When both
    scores are at least ``threshold`` the scores are printed and a scatter plot
    of the train and test points is shown; otherwise nothing is output.

    Parameters
    ----------
    feature : str
        Name of the predictor column in ``data``.
    response : str
        Name of the target column in ``data``.
    threshold : float, default 0.8
        Minimum R^2 required on both the training and the test split.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. ``None`` keeps the original
        behaviour (a different random split on every call); pass an int to
        make the split, and therefore the reported scores, reproducible.

    Returns
    -------
    None
    """
    x_train, x_test, y_train, y_test = train_test_split(
        data[[feature]],
        data[[response]],
        train_size=0.8,
        test_size=0.2,
        random_state=random_state,
    )
    lm = LinearRegression()
    lm.fit(x_train, y_train)
    train_score = lm.score(x_train, y_train)
    test_score = lm.score(x_test, y_test)

    if test_score >= threshold and train_score >= threshold:
        print(f"Training R^2 for {feature} vs {response}: {train_score}")
        print(f"Test R^2 for {feature} vs {response}: {test_score}")
        fig, ax = plt.subplots()
        # Label the two point sets so they can be told apart in the figure.
        ax.scatter(x_train, y_train, label="train")
        ax.scatter(x_test, y_test, label="test")
        ax.set_xlabel(feature)
        ax.set_ylabel(response)
        ax.legend()
        plt.show()
        # This function is called in a loop; release the figure once shown.
        plt.close(fig)

# Screen every candidate feature against winnings; single_regression only
# reports the features whose fit clears its R^2 threshold.
# The first column is skipped (presumably a non-numeric identifier such as
# the player name -- TODO confirm against the CSV) and the response itself
# is removed from the candidates.
columns = list(data.columns)[1:]
columns.remove("Winnings")
for feature in columns:
    single_regression(feature, "Winnings")

In [None]:
# Fit a multi-feature linear regression and report it only when BOTH the
# training and the test R^2 reach the threshold (0.85 by default).
# features is a list of column names.
def multiple_regression(features, response, threshold=0.85, random_state=None):
    """Regress ``response`` on several ``features`` columns of the global ``data``.

    The rows are split 80/20 into train and test sets, a ``LinearRegression``
    is fitted on the training part, and both scores are computed. When both
    scores are at least ``threshold`` the scores are printed. If exactly two
    features were given, a 3D scatter plot (feature 0, feature 1, response) of
    the train and test points is shown as well; any other number of features
    cannot be drawn on three axes, so only the scores are printed.

    Parameters
    ----------
    features : list of str
        Names of the predictor columns in ``data``.
    response : str
        Name of the target column in ``data``.
    threshold : float, default 0.85
        Minimum R^2 required on both the training and the test split.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. ``None`` keeps the original
        behaviour (a different random split on every call); pass an int to
        make the split, and therefore the reported scores, reproducible.

    Returns
    -------
    None
    """
    x_train, x_test, y_train, y_test = train_test_split(
        data[features],
        data[[response]],
        train_size=0.8,
        test_size=0.2,
        random_state=random_state,
    )
    lm = LinearRegression()
    lm.fit(x_train, y_train)
    train_score = lm.score(x_train, y_train)
    test_score = lm.score(x_test, y_test)

    if not (test_score >= threshold and train_score >= threshold):
        return

    print(f"Training R^2 for {features} vs {response}: {train_score}")
    print(f"Test R^2 for {features} vs {response}: {test_score}")

    # The plot uses one axis per feature plus one for the response, so it is
    # only meaningful (and only index-safe) for exactly two features.
    if len(features) != 2:
        return

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    for x_part, y_part, label in ((x_train, y_train, "train"), (x_test, y_test, "test")):
        ax.scatter(
            x_part.iloc[:, 0].to_numpy(),
            x_part.iloc[:, 1].to_numpy(),
            y_part.to_numpy().ravel(),  # single-column frame -> flat array of z values
            label=label,
        )
    ax.set_xlabel(features[0])
    ax.set_ylabel(features[1])
    # Label the z axis with the actual response rather than a fixed name.
    ax.set_zlabel(response)
    ax.legend()
    plt.show()
    # This function is called in a loop; release the figure once shown.
    plt.close(fig)

# Screen every unordered pair of candidate features against winnings.
# Candidates are all columns after the first one, minus the response itself.
columns = list(data.columns[1:])
columns.remove("Winnings")
# combinations() yields each unordered pair exactly once, so (a, b) and
# (b, a) are never both tried.
pairs_features = [*itertools.combinations(columns, 2)]
for feature_pair in pairs_features:
    multiple_regression([*feature_pair], "Winnings")
