<a href="https://colab.research.google.com/github/Mjcherono/IP-Week-6-FifaRanking/blob/main/FifaRankingResults3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###Importing Libraries and Loading the Datasets

In [None]:
import numpy as np
import pandas as pd

In [None]:
ranking = pd.read_csv('/content/fifa_ranking.csv')
results = pd.read_csv('/content/results.csv')

In [None]:
#checking on head
ranking.head()

In [None]:
results.head()

In [None]:
# Previewing the bottom of our dataset
print(ranking.tail())
print(results.tail())


In [None]:
# Determining the no. of records in our dataset
print(ranking.shape)
print(results.shape)

In [None]:
# Checking whether each column has an appropriate datatype
print(results.dtypes)
print(ranking.dtypes)

In [None]:
#summary of datasets
print(ranking.describe())
print(results.describe())

In [None]:
#changing date to datetime

results['date'] = pd.to_datetime(results['date'])
ranking['rank_date'] = pd.to_datetime(ranking['rank_date'])

#splitting year and month on dates
results['year'] = results.date.dt.year
results['month'] = results.date.dt.month

ranking['year'] = ranking.rank_date.dt.year
ranking['month'] = ranking.rank_date.dt.month

In [None]:
#merging the datasets
final_results = pd.merge(results, ranking, how = 'left', left_on = ['year', 'month'], right_on = ['year', 'month'])

In [None]:
final_results

###Data Preprocessing

In [None]:
#checking for null values
final_results.isnull().sum()

#dropping null values
final_results.dropna()



In [None]:
final_results.shape

In [None]:

#Checking out for Duplicates
final_results.duplicated().sum()

final_results.drop_duplicates(inplace= True)

In [None]:
final_results.columns

In [None]:
#dropping irrelevant columns

final_results = final_results[['rank','country_full','home_team','away_team','home_score','away_score','tournament','year','month']]
final_results

In [None]:
#creating a function to determine the status of a game
def status_hometeam(home_score,away_score):
  if home_score > away_score:
    return 'Win'
  elif home_score < away_score:
    return 'Lose'
  else:
    return 'Draw'

#creating status column
final_results['status']=final_results.apply(lambda x: status_hometeam(x['home_score'],x['away_score']), axis=1)

In [None]:
final_results.head()

###Exploratory Data Analysis

####Univariate Analysis

#####Distribution of home scores

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.distplot(final_results.home_score , kde=True)
plt.title('Histogram of Home score Distribution')
plt.show()

In [None]:
#line plot fr home score

final_results['home_score'].value_counts().sort_index().plot.line()


#####Distibution of Away scores

In [None]:
sns.distplot(final_results.away_score , kde=True)
plt.title('Histogram of Away score Distribution')
plt.show()

In [None]:
#label encode the categorical columns

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
final_results['tournament'] = le.fit_transform(final_results['tournament'])

####Binary Analysis

In [None]:
##Pairplot to heck for corelation
#sns.pairplot(final_results)

In [None]:
#heat map for correlation purposes
plt.figure(figsize=(8, 8))

fr_corr = final_results.corr()
sns.heatmap(fr_corr, 
            xticklabels = fr_corr.columns.values,
            yticklabels = fr_corr.columns.values,
            annot = True);

In [None]:
#Boxplot representation of home scores over the years

sns.boxplot(x="home_score", y="year", data=final_results)


In [None]:
#Boxplot representation of away scores over the years

sns.boxplot(x="away_score", y="year", data=final_results)


In [None]:
final_results.columns

In [None]:
final_results['rank'].nunique()

In [None]:
final_results['home_team'].nunique()

In [None]:
final_results['away_team'].nunique()

In [None]:
final_results['home_score'].nunique()

In [None]:
final_results['away_score'].nunique()

In [None]:
final_results['tournament'].nunique()

In [None]:

final_results['status'].nunique()

#####Average goals per year

In [None]:
#Home scores

home_goals = final_results.groupby('year')['home_score'].mean().reindex()
sns.lineplot(data = home_goals )


In [None]:
#Away scores

away_goals = final_results.groupby('year')['away_score'].mean().reindex()
sns.lineplot(data = away_goals )


#####Distribution of ranks over the number of scores

In [None]:
final_results.plot.scatter(x='home_score', y='rank')


###Polynomial Regression

In [None]:
final_results.head(5)

In [None]:
#dropping country full and month

final_results.drop(['country_full','month'],inplace=True,axis=1)

In [None]:
final_results.drop(['year'],inplace=True,axis=1)

In [None]:
final_results.head()

####Detecting multicollinearity

In [None]:
#We'll check for collinearity in independent variables

correlations = final_results.drop(['home_score'], axis=1)
correlations.corr()

In [None]:
#Computing VIF Scores

#pd.DataFrame(np.linalg.inv(correlations.values), index = correlations.index, columns = correlations.columns)


In [None]:
final_results.head()

####Model 1: Predict how many goals the home team scores.

In [None]:
#creating and training our polynomial model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

X = final_results[['rank','tournament']]
y = final_results['home_score']

#split to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 25)

#fit a polynomial regression
poly = PolynomialFeatures(degree = 6)
poly.fit_transform(X_train)

#training our model
poly_reg = LinearRegression()
poly_reg.fit(X_train,y_train)

#making predictions
y_pred = poly_reg.predict(X_test)
y_pred

#using rmse to measure the accuracy
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [None]:
final_results.describe()

####Model 2: Predict how many goals the away team scores.



In [None]:
#creating and training our polynomial model
#for the second model we use away scores

X = final_results[['rank','tournament']]
y = final_results['away_score']

#split to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 25)

#fit a polynomial regression
poly = PolynomialFeatures(degree = 2)
poly.fit_transform(X_train)

#training our model
poly_reg = LinearRegression()
poly_reg.fit(X_train,y_train)

#making predictions
y_pred = poly_reg.predict(X_test)
y_pred

#using rmse to measure the accuracy
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

#####The lower the value of RMSE the better the model.Hence this model's performance is fairly good.

####Using Residual Plots to check on the accuracy of the model

In [None]:
#Residual = test value - predicted value

residuals = np.subtract(y_pred, y_test)

#describe our residual:
print(pd.DataFrame(residuals).describe())

print(residuals.mean())

######Our residual mean is close to 0 meaning our prediction is fairly correct, though slightly overestimating chances by close to ; 0.13%

######Residual plot

In [None]:
plt.scatter(y_pred, residuals, color='black')
plt.ylabel('residual')
plt.xlabel('fitted values')
plt.axhline(y= residuals.mean(), color='red', linewidth=1)
plt.show()

######Our residuals are centered arount the 0 mean

####Heteroskedasticity Test

###### The test establishes a null hypothesis that the variance is equal for all our data points and the alternative hypothesis that the variance is different for atleast one pair of datapoints.

In [None]:
#finding the p value
import scipy as sp

test_result, p_value = sp.stats.bartlett(y_pred, residuals)

#finding the critical value of the chi squared distribution
degree_of_freedom = len(y_pred)-1
probability = 1 - p_value

critical_value = sp.stats.chi2.ppf(probability, degree_of_freedom)
print(critical_value)

#if test_result > critical_value we reject the null hypothesis
#
if (test_result > critical_value):
  print('the variances are unequal, and the model should be reassessed')
else:
  print('the variances are homogeneous!')

###Logistic Regression

In [None]:
#dataset
final_results.head()

In [None]:
#Checking that our target variable is binary

sns.countplot(x='status',data=final_results, palette='hls')


In [None]:
#Converting our categorical variable to dummy indicators
#final_results['status']  = pd.get_dummies(final_results['status'],drop_first=False)
#final_results

In [None]:
final_results.drop(['home_team','away_team'],inplace=True , axis=1)


In [None]:
final_results

In [72]:
#splitting the data into features and target

X = final_results.drop("status",axis=1)
y = final_results["status"]

#
X = pd.get_dummies(X, drop_first=True)


#splitting the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=20)

#creating an object of the model
from sklearn.linear_model import  LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

#making y predictions
y_pred = logreg.predict(X_test)

#evaluating the model using a confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
cm

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


array([[141586,      0,      0],
       [     0, 164244,      0],
       [     0,      0, 294656]])

####Hyperparameter tuning for logistic regression

#####using gridsearch cv

In [None]:
# Creating regularization penalty space
penalty = ['l1', 'l2']

# Creating regularization hyperparameter space
C = np.logspace(0, 5, 10)

# Creating hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

#grid search using 5-fold cross validation
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(logreg, hyperparameters, cv=5, verbose=0)

#fitting
best_model = clf.fit(X, y)

#checking on the hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative so

In [74]:
# Predicting target vector
best_model.predict(X)

NameError: ignored

###Conclusion

#####Logistic regression as a model performed well in the prediction of which team won,lost or had a draw.

######The dataset provided for rankings had no data before 1993, therefore was a little insufficient in training the model for better performance.