In [None]:
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
!pip install --upgrade category_encoders

# 1-importing necessary libs





In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics

##1.1 Read Data with Pandas

In [None]:
bitrate_train_df = pd.read_csv('./bitrate_train.csv', low_memory=False)
bitrate_test_df = pd.read_csv('./bitrate_test.csv', low_memory=False)


## 1.2 Data exploration

In [None]:
bitrate_train_df.head(25)

Unnamed: 0,fps_mean,fps_std,rtt_mean,rtt_std,dropped_frames_mean,dropped_frames_std,dropped_frames_max,bitrate_mean,bitrate_std,target
0,24.4,0.516398,91.1,6.723921,0.0,0.0,0.0,460.5,7.648529,616.0
1,28.6,2.065591,99.7,15.923777,0.0,0.0,0.0,616.3,155.414893,565.0
2,30.0,0.0,98.1,11.798776,0.0,0.0,0.0,565.2,9.077445,573.0
3,30.3,0.948683,99.4,13.014522,0.0,0.0,0.0,573.8,28.350191,585.0
4,29.9,0.316228,123.2,62.476307,0.0,0.0,0.0,585.8,59.458295,555.0
5,29.5,1.649916,131.2,114.257798,0.0,0.0,0.0,555.2,47.713963,456.0
6,24.3,0.483046,98.3,16.499495,0.0,0.0,0.0,456.7,9.42868,511.0
7,24.5,0.971825,141.9,103.814418,0.0,0.0,0.0,511.4,156.318905,675.0
8,30.0,0.0,107.5,18.733511,0.0,0.0,0.0,675.5,88.334277,1129.0
9,30.0,0.471405,108.2,10.952422,0.0,0.0,0.0,1129.0,989.477079,560.0


In [None]:
bitrate_train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fps_mean,379021.0,35.231127,10.97501,10.0,28.8,30.0,43.6,125.8
fps_std,379021.0,1.725705,2.505942,0.0,0.316228,0.942809,2.233582,307.167273
rtt_mean,379021.0,49.623858,94.781098,0.0,14.3,32.2,55.9,12898.4
rtt_std,379021.0,12.763672,112.68446,0.0,0.699206,1.433721,4.948625,40721.933293
dropped_frames_mean,379021.0,0.180451,1.73289,0.0,0.0,0.0,0.0,540.0
dropped_frames_std,379021.0,0.469548,3.157866,0.0,0.0,0.0,0.0,202.38577
dropped_frames_max,379021.0,1.450719,9.670928,0.0,0.0,0.0,0.0,640.0
bitrate_mean,379021.0,7516.585502,6073.992189,0.0,2773.3,6287.2,10187.2,64913.5
bitrate_std,379021.0,1603.487501,1721.021623,0.0,383.68355,1112.71001,2241.848801,26908.532303
target,379021.0,7525.396231,6070.817736,0.0,2785.0,6296.0,10192.0,64913.0


##1.3 checking missing data 

In [None]:
bitrate_train_df.isna().sum()

fps_mean               0
fps_std                0
rtt_mean               0
rtt_std                0
dropped_frames_mean    0
dropped_frames_std     0
dropped_frames_max     0
bitrate_mean           0
bitrate_std            0
target                 0
dtype: int64

##1.4  Profiling Data

In [None]:
from pandas_profiling import ProfileReport

report = ProfileReport(bitrate_train_df)
report.to_file('bitrate_train_report.html')

## 1.5 removing duplicate 

In [None]:
bitrate_train_df.drop_duplicates(inplace=True)

##1.5.1**outliers removal**

The common industry practice is to use 3 standard deviations away from the mean to differentiate outlier from non-outlier. 

By using 3 standard deviations we remove the 0.3% extreme cases.

In [None]:
from scipy import stats
bitrate_train_df_no_outliers = bitrate_train_df[(np.abs(stats.zscore(bitrate_train_df)) < 3).all(axis=1)].copy()
print(bitrate_train_df.shape)
print(bitrate_train_df_no_outliers.shape)

(375660, 10)
(349696, 10)


## 1.6 Features selection

In [None]:
X_train=bitrate_train_df[['bitrate_mean']]
y_train=bitrate_train_df[["target"]]

X_train_no_outliers = bitrate_train_df_no_outliers[['bitrate_mean']]
y_train_no_outliers = bitrate_train_df_no_outliers[["target"]]

## 1.7.1 ploting (with outliers)

In [None]:
plt.scatter(X_train.values, y_train.values)
plt.title('target vs bitrate_mean')
plt.xlabel('bitrate_mean')
plt.ylabel('target')
plt.show()

## 1.7.2 ploting (without outliers)

In [None]:
plt.scatter(X_train_no_outliers.values, y_train_no_outliers.values)
plt.title('target vs bitrate_mean')
plt.xlabel('bitrate_mean')
plt.ylabel('target')
plt.show()

##1.8 training simple linear regression model (with outliers)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

simple_linear = LinearRegression()
simple_linear.fit(X_train, y_train)
cross_val = cross_val_score(simple_linear, X_train, y_train, cv=3)
print(f"Model intercept : {simple_linear.intercept_}")
print(f"Model coefficients : {simple_linear.coef_}")
print(f"Cross validation score is : {cross_val}")


Model intercept : [431.69965255]
Model coefficients : [[0.94402994]]
Cross validation score is : [0.88867167 0.90468659 0.87533663]


##1.9 Model evaluation (with outliers)

In [None]:
X_test=bitrate_test_df[['bitrate_mean']]
y_test=bitrate_test_df[['target']]

y_pred = simple_linear.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


Mean Absolute Error: 1079.2546842396653
Mean Squared Error: 3825594.571078191
Root Mean Squared Error: 1955.9127207209915


##1.10 training simple linear regression model (without outliers)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

simple_linear_no_outliers = LinearRegression()
simple_linear_no_outliers.fit(X_train_no_outliers, y_train_no_outliers)
cross_val = cross_val_score(simple_linear_no_outliers, X_train, y_train, cv=3)

print(f"Cross validation score is : {cross_val}")
print(f"Model intercept : {simple_linear_no_outliers.intercept_}")
print(f"Model coefficients : {simple_linear_no_outliers.coef_}")


Cross validation score is : [0.88867167 0.90468659 0.87533663]
Model intercept : [414.6235588]
Model coefficients : [[0.94356953]]


##1.11 Model evaluation (without outliers)

In [None]:
X_test=bitrate_test_df[['bitrate_mean']]
y_test=bitrate_test_df[['target']]

y_pred = simple_linear_no_outliers.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 1076.1822721948236
Mean Squared Error: 3825173.519452203
Root Mean Squared Error: 1955.8050821726083


## 1.12 training linear regression Model

In [None]:
X_train = bitrate_train_df.drop(['target', 'dropped_frames_max','fps_std','dropped_frames_std','dropped_frames_mean'], axis=1)

y_train = bitrate_train_df[["target"]]

X_train_no_outlier = bitrate_train_df_no_outliers.drop(['target', 'dropped_frames_max','fps_std','dropped_frames_std','dropped_frames_mean'], axis=1)

y_train_no_outlier = bitrate_train_df_no_outliers[["target"]]

X_test = bitrate_test_df.drop(['target', 'dropped_frames_max','fps_std','dropped_frames_std','dropped_frames_mean'], axis=1)
y_test = bitrate_test_df[['target']]

linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
print(f"Model intercept : {linear_reg.intercept_}")
print(f"Model coefficients : {linear_reg.coef_}")
cross_val = cross_val_score(linear_reg, X_train, y_train, cv=3)

print(f"Cross validation score is : {cross_val}")


linear_reg_no_outlier = LinearRegression()
linear_reg_no_outlier.fit(X_train_no_outlier, y_train_no_outlier)
print(f"Model intercept : {linear_reg.intercept_}")
print(f"Model coefficients : {linear_reg.coef_}")

cross_val = cross_val_score(linear_reg_no_outlier, X_train, y_train, cv=3)

print(f"Cross validation score is : {cross_val}")

Model intercept : [343.61364818]
Model coefficients : [[ 1.86533611 -0.5758185   0.10565859  0.92439003  0.12234561]]
Cross validation score is : [0.88949705 0.90576686 0.87617918]
Model intercept : [343.61364818]
Model coefficients : [[ 1.86533611 -0.5758185   0.10565859  0.92439003  0.12234561]]
Cross validation score is : [0.88949705 0.90576686 0.87617918]


## 1.13 evalute linear regression model

In [None]:
y_pred = linear_reg.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

y_pred_no_outlier = linear_reg_no_outlier.predict(X_test)

print('Mean Absolute whithout outliers:', metrics.mean_absolute_error(y_test, y_pred_no_outlier))
print('Mean Squared Error whithout outliers:', metrics.mean_squared_error(y_test, y_pred_no_outlier))
print('Root Mean Squared Error whithout outliers:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_no_outlier)))

Mean Absolute Error: 1078.0533497487836
Mean Squared Error: 3798894.311092989
Root Mean Squared Error: 1949.075245108046
Mean Absolute whithout outliers: 1076.8090334283263
Mean Squared Error whithout outliers: 3808960.704102732
Root Mean Squared Error whithout outliers: 1951.6558877278371


##1.14 polynomial Regression

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures #to convert the original features into their higher order terms 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

## 1.15 training and evaluating polynomial regression (with ouliers)

In [None]:
degrees = [2, 3, 4, 5]

for i in range(len(degrees)):

    polynomial_features = PolynomialFeatures(degree=degrees[i])
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X_train, y_train)

   
    y_pred = pipeline.predict(X_test)
    cross_val = cross_val_score(linear_regression, X_train, y_train, cv=3)

    print(f"Cross validation score is : {cross_val}")

    print(f"Degree: {degrees[i]}")
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Cross validation score is : [0.88949705 0.90576686 0.87617918]
Degree: 2
Mean Absolute Error: 1055.4546276083336
Mean Squared Error: 3770454.7551768064
Root Mean Squared Error: 1941.7658857794381
Cross validation score is : [0.88949705 0.90576686 0.87617918]
Degree: 3
Mean Absolute Error: 1052.0772265072176
Mean Squared Error: 3776976.767913053
Root Mean Squared Error: 1943.4445626034853
Cross validation score is : [0.88949705 0.90576686 0.87617918]
Degree: 4
Mean Absolute Error: 1125.3144022339193
Mean Squared Error: 4978780.643634978
Root Mean Squared Error: 2231.318140390334
Cross validation score is : [0.88949705 0.90576686 0.87617918]
Degree: 5
Mean Absolute Error: 1841.3016186124748
Mean Squared Error: 73796512.84951514
Root Mean Squared Error: 8590.489674606166


## 1.16 training and evaluating polynomial regression (without ouliers)

In [None]:
degrees = [2, 3, 4, 5]

plt.figure(figsize=(14, 5))
for i in range(len(degrees)):

    polynomial_features = PolynomialFeatures(degree=degrees[i])
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X_train_no_outlier, y_train_no_outlier)
    y_pred = pipeline.predict(X_test)

    cross_val = cross_val_score(linear_regression, X_train, y_train, cv=3)
    print(f"Degree: {degrees[i]}")


    print(f"Cross validation score is : {cross_val}")
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Degree: 2
Cross validation score is : [0.88949705 0.90576686 0.87617918]
Mean Absolute Error: 1083.7078379456561
Mean Squared Error: 4971313.44671029
Root Mean Squared Error: 2229.6442421853517
Degree: 3
Cross validation score is : [0.88949705 0.90576686 0.87617918]
Mean Absolute Error: 2085.215388817448
Mean Squared Error: 2284479435.5794277
Root Mean Squared Error: 47796.22825683453
Degree: 4
Cross validation score is : [0.88949705 0.90576686 0.87617918]
Mean Absolute Error: 28706.916566334567
Mean Squared Error: 3008886690409.9556
Root Mean Squared Error: 1734614.277126173
Degree: 5
Cross validation score is : [0.88949705 0.90576686 0.87617918]
Mean Absolute Error: 1092145.0756675547
Mean Squared Error: 1.8265592232850068e+16
Root Mean Squared Error: 135150257.98292089


<Figure size 1008x360 with 0 Axes>

##1.17 multi linear regression with regularization 

In [None]:
from sklearn.linear_model import Lasso

X_train = bitrate_train_df.drop(['target','rtt_mean','rtt_std','dropped_frames_mean','dropped_frames_max','dropped_frames_std'], axis=1)

y_train=bitrate_train_df[["target"]]

X_test = bitrate_test_df.drop(['target','rtt_mean','rtt_std','dropped_frames_mean','dropped_frames_max','dropped_frames_std'], axis=1)

y_test=bitrate_test_df[['target']]

X_train, x_val, y_train, y_val = train_test_split(X_train, y_train, test_size=1/8, random_state=45)

lasso = Lasso()
lasso.fit(X_train, y_train)

print('Lasso coef ', lasso.coef_)

Lasso coef  [ 1.84905225 -1.27931681  0.92477999  0.12576433]


In [None]:
from sklearn.metrics import mean_squared_error

alphas = [0.1,0.5,0.9,1,1.4,2.2,2.5,3]
losses = []
for alpha in alphas:
    model = Lasso(alpha = alpha).fit(X_train, y_train)
    mse = mean_squared_error(model.predict(x_val),y_val)
    losses.append(mse)

best_alpha = alphas[np.argmin(losses)]
print("Best value of alpha:", best_alpha)

Best value of alpha: 0.1


In [None]:
lasso = Lasso(best_alpha)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print("MSE on test set:", mean_squared_error(y_test, y_pred))

MSE on test set: 3801448.2796545415


# 2  Logistic Regression

##2.1 reading and visualization 

In [None]:
#!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [None]:
import pandas as pd
import numpy as np
import sklearn

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
train_data_df = pd.read_csv('./train_data.csv')
test_data_df = pd.read_csv('./test_data.csv')
train_data_df.drop_duplicates(inplace=True)
train_data_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fps_mean,373142.0,35.069812,11.39201,0.0,28.2,30.0,44.1,127.1
fps_std,373142.0,2.489203,3.805238,0.0,0.421637,1.229273,2.836273,312.5408
fps_lags,373142.0,0.09737,0.5675634,0.0,0.0,0.0,0.0,10.0
rtt_mean,373142.0,57.281921,135.6887,0.0,16.0,34.0,59.2,12898.4
rtt_std,373142.0,21.229666,163.1101,0.0,0.788811,1.712698,6.196773,40721.93
dropped_frames_mean,373142.0,346235.005845,20301920.0,0.0,0.0,0.0,0.0,2097289000.0
dropped_frames_std,373142.0,149161.674525,9614595.0,0.0,0.0,0.0,0.0,996375100.0
dropped_frames_max,373142.0,522273.665476,27490800.0,0.0,0.0,0.0,0.0,2097289000.0
auto_fec_mean,373142.0,51.952779,35.5068,0.0,50.0,50.0,50.0,250.0
stream_quality,373142.0,0.063927,0.2446239,0.0,0.0,0.0,0.0,1.0


In [None]:
train_data_df.head(25)

Unnamed: 0,fps_mean,fps_std,fps_lags,rtt_mean,rtt_std,dropped_frames_mean,dropped_frames_std,dropped_frames_max,auto_bitrate_state,auto_fec_state,auto_fec_mean,stream_quality
0,24.4,0.516398,0,91.1,6.723921,0.0,0.0,0.0,off,partial,50.0,0
1,28.6,2.065591,0,99.7,15.923777,0.0,0.0,0.0,off,partial,50.0,0
2,30.0,0.0,0,98.1,11.798776,0.0,0.0,0.0,off,partial,50.0,0
3,30.3,0.948683,0,99.4,13.014522,0.0,0.0,0.0,off,partial,50.0,0
4,29.9,0.316228,0,123.2,62.476307,0.0,0.0,0.0,off,partial,50.0,0
5,29.5,1.649916,0,131.2,114.257798,0.0,0.0,0.0,off,partial,50.0,0
6,24.3,0.483046,0,98.3,16.499495,0.0,0.0,0.0,off,partial,50.0,0
7,24.5,0.971825,0,141.9,103.814418,0.0,0.0,0.0,off,partial,50.0,0
8,30.0,0.0,0,107.5,18.733511,0.0,0.0,0.0,off,partial,50.0,0
9,30.0,0.471405,0,108.2,10.952422,0.0,0.0,0.0,off,partial,50.0,0


##2.2 profiling data

In [None]:
from pandas_profiling import ProfileReport
report = ProfileReport(train_data_df)
report.to_file('train_data_report.html')

##2.3 balancing data

In [None]:
train_data_df_0 = train_data_df[train_data_df['stream_quality'] == 0]
train_data_df_1 = train_data_df[train_data_df['stream_quality'] == 1]


print(len(train_data_df_0.index))
print(len(train_data_df_1.index))
train_data_df_1_over = train_data_df_1.sample(len(train_data_df_0.index), replace=True)#up-sample
print(len(train_data_df_1_over.index))

train_data_df = pd.concat([train_data_df_0, train_data_df_1_over], axis=0)


349288
23854
349288


##2.4 Feature Selection

In [None]:
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
 
x_train=train_data_df.drop(['fps_mean','rtt_mean','rtt_std','auto_fec_mean'], axis = 1)

x_test=test_data_df.drop(['fps_mean','rtt_mean','rtt_std','auto_fec_mean'], axis = 1)


##2.5 Principal Component Analysis 

In [None]:
import plotly.express as px
from sklearn.decomposition import PCA

x_train_1=x_train[['fps_std', 'fps_lags', 'dropped_frames_mean', 'dropped_frames_std','dropped_frames_max']]
y_train_1=train_data_df['stream_quality']

pca = PCA(n_components=2)
components = pca.fit_transform(x_train_1,y_train_1)

fig = px.scatter(components, x=0, y=1)
fig.show()


print('Mean : ',pca.mean_)
print('explained variance : ',pca.explained_variance_)
print('explained variance ratio: ',pca.explained_variance_ratio_)

##2.6 feature encoding

In [None]:
from category_encoders import OneHotEncoder

# encode categorical features
features_to_encode = ['auto_bitrate_state','auto_fec_state']
encoder = OneHotEncoder(cols=features_to_encode)

encoder.fit(x_train)

x_train_enc = encoder.transform(x_train)
x_test_enc = encoder.transform(x_test)

#print(x_train_enc)


##2.7 Feature scaling

In [42]:
# feature scaling using MinMaxScaler
scaler = MinMaxScaler().fit(x_train_enc)
x_train = pd.DataFrame(scaler.transform(x_train_enc), columns=x_train_enc.columns)
scaler = MinMaxScaler().fit(x_test_enc)
x_test = pd.DataFrame(scaler.transform(x_test_enc), columns=x_test_enc.columns)



##2.8 fiting and evalueting LR

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


y_train=x_train[['stream_quality']]
y_test=x_test[['stream_quality']]

x_train=x_train.drop(['stream_quality'], axis = 1)
x_test=x_test.drop(['stream_quality'], axis = 1)

# fit Logistic Regression
logisticRegression = LogisticRegression(random_state=0).fit(x_train, y_train)
y_test_pred = logisticRegression.predict(x_test)

# calculate metrics
print('Testing accuracy = {}'.format(metrics.accuracy_score(y_test, y_test_pred)))
print('Testing precision = {}'.format(metrics.precision_score(y_test, y_test_pred)))
print('Testing recall = {}'.format(metrics.recall_score(y_test, y_test_pred)))
print('Testing f1_score = {}'.format(metrics.f1_score(y_test, y_test_pred)))




A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



Testing accuracy = 0.7868355802229922
Testing precision = 0.17228654124457307
Testing recall = 0.6068561233592455
Testing f1_score = 0.2683799701299067


##2.9 calculating matrices for different threshold

In [None]:
#calculate metrics for each threshold above and plot the result as below.
thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
pred_proba = logisticRegression.predict_proba(x_test)

results = [[],[],[],[]]
for i in thresholds:
    y_test_pred_thr = np.where(pred_proba[:, 1] > i, 1, 0)
    results[0].append(metrics.accuracy_score(y_test, y_test_pred_thr))
    results[1].append(metrics.precision_score(y_test, y_test_pred_thr))
    results[2].append(metrics.recall_score(y_test, y_test_pred_thr))
    results[3].append(metrics.f1_score(y_test, y_test_pred_thr))

plt.plot(thresholds, results[0], label = 'accuracy')   
plt.plot(thresholds, results[1], label = 'precision')   
plt.plot(thresholds, results[2], label = 'recall')
plt.plot(thresholds, results[3], label = 'f1_score')

plt.title('Threshold Selection')
plt.xlabel('threshold')
plt.ylabel('score')
plt.legend()
plt.grid()

##2.10 Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

print('sklearn Confusion Matrix :\n', confusion_matrix(y_test, y_test_pred))
plot_confusion_matrix(logisticRegression, x_test, y_test)  
plt.show()