## Packages

In [22]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from joblib import dump, load
from scipy.stats import chi2_contingency

## Load the dataset

In [23]:
# Remove the cap on how many columns are rendered when a DataFrame is displayed
pd.options.display.max_columns = None

# Read the dataset from the current working directory
df = pd.read_csv(filepath_or_buffer='dataset.csv')

## Get the first kill and round winner

In [None]:
def extract_first_kills(snapshots):
    """Collapse snapshot rows into one row per round whose first kill is known.

    Consecutive rows that share the same ``(ct_score, t_score)`` pair are
    treated as belonging to one round. Within a round, the first row showing
    4-vs-5 or 5-vs-4 players alive decides which side got the first kill.
    Rounds that contain no such row (for example when the snapshots jump
    straight from 5-vs-5 to 4-vs-4) are skipped.

    Parameters
    ----------
    snapshots : pandas.DataFrame
        Must contain the columns ``ct_score``, ``t_score``,
        ``ct_players_alive``, ``t_players_alive`` and ``round_winner``.

    Returns
    -------
    pandas.DataFrame
        One row per retained round with the columns ``round_winner`` (value
        taken from the last row of the round) and ``t_firstkill`` (1 when the
        CT side was the first to drop to four players, 0 when the T side was).
        The two columns are present even when no round qualifies.

    Raises
    ------
    KeyError
        If any required column is missing, e.g. because ``df`` was already
        replaced by its dummy-encoded version.
    """
    required = ['ct_score', 't_score', 'ct_players_alive', 't_players_alive', 'round_winner']
    missing = [name for name in required if name not in snapshots.columns]
    if missing:
        raise KeyError(
            f"Missing column(s) {missing}; re-run the cell that loads dataset.csv first"
        )

    rounds = []
    previous_scores = None   # score pair of the round currently being scanned
    t_firstkill = None       # None until a first kill is seen in the current round
    last_winner = None       # round_winner of the most recently visited row

    def flush_round():
        # Record the round that just ended, but only if its first kill was observed
        if t_firstkill is not None:
            rounds.append({'round_winner': last_winner, 't_firstkill': t_firstkill})

    # Iterate over just the needed columns; this avoids building a Series per
    # row and does not depend on the frame having a default integer index.
    rows = zip(
        snapshots['ct_score'],
        snapshots['t_score'],
        snapshots['ct_players_alive'],
        snapshots['t_players_alive'],
        snapshots['round_winner'],
    )
    for ct_score, t_score, ct_alive, t_alive, winner in rows:
        current_scores = (ct_score, t_score)

        # A change in the score pair marks the start of a new round
        if current_scores != previous_scores:
            flush_round()
            t_firstkill = None
            previous_scores = current_scores

        # The first 4v5 / 5v4 snapshot of the round reveals who struck first
        if t_firstkill is None and (ct_alive, t_alive) in ((4, 5), (5, 4)):
            t_firstkill = 0 if ct_alive == 5 else 1

        last_winner = winner

    # The final round has no following score change, so flush it explicitly
    flush_round()

    return pd.DataFrame(rounds, columns=['round_winner', 't_firstkill'])


# Build the per-round table from the snapshots loaded earlier in the notebook
rounds_df = extract_first_kills(df)

# Display the new DataFrame
rounds_df

Unnamed: 0,round_winner,t_firstkill
0,CT,0
1,CT,1
2,T,1
3,T,0
4,CT,0
...,...,...
7168,CT,0
7169,T,1
7170,CT,0
7171,T,0


## Test whether the first kill matters

In [None]:
# Count the number of rounds where terrorists got the first kill and won
t_firstkill_and_win = len(rounds_df[(rounds_df['t_firstkill'] == 1) & (rounds_df['round_winner'] == 'T')])

# Count the number of rounds where terrorists got the first kill
t_firstkill_total = len(rounds_df[rounds_df['t_firstkill'] == 1])

# Calculate the probability of T winning given they got the first kill
prob_t_win_given_firstkill = t_firstkill_and_win / t_firstkill_total if t_firstkill_total > 0 else 0

# Count the number of rounds where terrorists didn't get the first kill but still won
t_not_firstkill_and_win = len(rounds_df[(rounds_df['t_firstkill'] == 0) & (rounds_df['round_winner'] == 'T')])

# Count the number of rounds where terrorists didn't get the first kill
t_not_firstkill_total = len(rounds_df[rounds_df['t_firstkill'] == 0])

# Calculate the probability of T winning given they did not get the first kill
prob_t_win_given_not_firstkill = t_not_firstkill_and_win / t_not_firstkill_total if t_not_firstkill_total > 0 else 0

# Output the results
print("Probability of Terrorists winning given they got the first kill:", prob_t_win_given_firstkill)
print("Probability of Terrorists winning given they did not get the first kill:", prob_t_win_given_not_firstkill)

Probability of Terrorists winning given they got the first kill: 0.7450442207990241
Probability of Terrorists winning given they did not get the first kill: 0.29583975346687214


In [None]:
# Count the number of rounds where CT got the first kill and won
ct_firstkill_and_win = len(rounds_df[(rounds_df['t_firstkill'] == 0) & (rounds_df['round_winner'] == 'CT')])

# Count the number of rounds where CT got the first kill
ct_firstkill_total = len(rounds_df[rounds_df['t_firstkill'] == 0])

# Calculate the probability of CT winning given they got the first kill
prob_ct_win_given_firstkill = ct_firstkill_and_win / ct_firstkill_total if ct_firstkill_total > 0 else 0

# Count the number of rounds where CT didn't get the first kill but still won
ct_not_firstkill_and_win = len(rounds_df[(rounds_df['t_firstkill'] == 1) & (rounds_df['round_winner'] == 'CT')])

# Count the number of rounds where CT didn't get the first kill
ct_not_firstkill_total = len(rounds_df[rounds_df['t_firstkill'] == 1])

# Calculate the probability of CT winning given they did not get the first kill
prob_ct_win_given_not_firstkill = ct_not_firstkill_and_win / ct_not_firstkill_total if ct_not_firstkill_total > 0 else 0

# Output the results
print("Probability of Counter-Terrorists winning given they got the first kill:", prob_ct_win_given_firstkill)
print("Probability of Counter-Terrorists winning given they did not get the first kill:", prob_ct_win_given_not_firstkill)

Probability of Counter-Terrorists winning given they got the first kill: 0.7041602465331279
Probability of Counter-Terrorists winning given they did not get the first kill: 0.2549557792009759


Our null hypothesis (H0) is that the side that gets the first kill goes on to win the round with the same probability whether that side is the terrorists or the counter-terrorists. The alternative hypothesis (H1) is that these two winning probabilities differ.

In [None]:
# Chi-squared test comparing the first-killer's win rate between the two sides.
# Uses np and chi2_contingency from the imports cell at the top of the notebook,
# and rounds_df (columns: round_winner, t_firstkill) built in an earlier cell.

# Build each boolean filter once and combine them for the four cell counts
t_struck_first = rounds_df['t_firstkill'] == 1
ct_struck_first = rounds_df['t_firstkill'] == 0
t_took_round = rounds_df['round_winner'] == 'T'
ct_took_round = rounds_df['round_winner'] == 'CT'

# Calculate the counts
t_firstkill_and_win = int((t_struck_first & t_took_round).sum())
t_firstkill_and_lose = int((t_struck_first & ct_took_round).sum())
ct_firstkill_and_win = int((ct_struck_first & ct_took_round).sum())
ct_firstkill_and_lose = int((ct_struck_first & t_took_round).sum())

# Contingency table: rows = side that got the first kill (T, CT),
# columns = that side won the round / that side lost the round
contingency_table = np.array([[t_firstkill_and_win, t_firstkill_and_lose],
                              [ct_firstkill_and_win, ct_firstkill_and_lose]])

# Performing the chi-squared test.
# Note: for a 2x2 table chi2_contingency applies Yates' continuity correction by default.
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Interpreting the result
alpha = 0.05  # significance level
print("Chi-Squared: ", chi2)
print("P-value: ", p)

if p < alpha:
    print("There is a significant difference in winning probabilities (reject H0)")
else:
    print("There is no significant difference in winning probabilities (fail to reject H0)")

NameError: name 't_firstkill_and_lose' is not defined

## Peek at the dataset

In [None]:
# Preview the first five rows of the loaded dataset
df.head(n=5)

Unnamed: 0,time_left,ct_score,t_score,map,bomb_planted,ct_health,t_health,ct_armor,t_armor,ct_money,t_money,ct_helmets,t_helmets,ct_defuse_kits,ct_players_alive,t_players_alive,ct_weapon_ak47,t_weapon_ak47,ct_weapon_aug,t_weapon_aug,ct_weapon_awp,t_weapon_awp,ct_weapon_bizon,t_weapon_bizon,ct_weapon_cz75auto,t_weapon_cz75auto,ct_weapon_elite,t_weapon_elite,ct_weapon_famas,t_weapon_famas,ct_weapon_g3sg1,t_weapon_g3sg1,ct_weapon_galilar,t_weapon_galilar,ct_weapon_glock,t_weapon_glock,ct_weapon_m249,t_weapon_m249,ct_weapon_m4a1s,t_weapon_m4a1s,ct_weapon_m4a4,t_weapon_m4a4,ct_weapon_mac10,t_weapon_mac10,ct_weapon_mag7,t_weapon_mag7,ct_weapon_mp5sd,t_weapon_mp5sd,ct_weapon_mp7,t_weapon_mp7,ct_weapon_mp9,t_weapon_mp9,ct_weapon_negev,t_weapon_negev,ct_weapon_nova,t_weapon_nova,ct_weapon_p90,t_weapon_p90,ct_weapon_r8revolver,t_weapon_r8revolver,ct_weapon_sawedoff,t_weapon_sawedoff,ct_weapon_scar20,t_weapon_scar20,ct_weapon_sg553,t_weapon_sg553,ct_weapon_ssg08,t_weapon_ssg08,ct_weapon_ump45,t_weapon_ump45,ct_weapon_xm1014,t_weapon_xm1014,ct_weapon_deagle,t_weapon_deagle,ct_weapon_fiveseven,t_weapon_fiveseven,ct_weapon_usps,t_weapon_usps,ct_weapon_p250,t_weapon_p250,ct_weapon_p2000,t_weapon_p2000,ct_weapon_tec9,t_weapon_tec9,ct_grenade_hegrenade,t_grenade_hegrenade,ct_grenade_flashbang,t_grenade_flashbang,ct_grenade_smokegrenade,t_grenade_smokegrenade,ct_grenade_incendiarygrenade,t_grenade_incendiarygrenade,ct_grenade_molotovgrenade,t_grenade_molotovgrenade,ct_grenade_decoygrenade,t_grenade_decoygrenade,round_winner
0,175.0,0.0,0.0,de_dust2,False,500.0,500.0,0.0,0.0,4000.0,4000.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CT
1,156.03,0.0,0.0,de_dust2,False,500.0,500.0,400.0,300.0,600.0,650.0,0.0,0.0,1.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,CT
2,96.03,0.0,0.0,de_dust2,False,391.0,400.0,294.0,200.0,750.0,500.0,0.0,0.0,1.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,CT
3,76.03,0.0,0.0,de_dust2,False,391.0,400.0,294.0,200.0,750.0,500.0,0.0,0.0,1.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CT
4,174.97,1.0,0.0,de_dust2,False,500.0,500.0,192.0,0.0,18350.0,10750.0,0.0,0.0,1.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CT


## Create dummy variables

In [None]:
# Columns to one-hot encode; drop_first=True omits the first level of each one
categorical_columns = ['map', 'bomb_planted', 'round_winner']
df = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)

# Preview the encoded frame
df.head()

Unnamed: 0,time_left,ct_score,t_score,ct_health,t_health,ct_armor,t_armor,ct_money,t_money,ct_helmets,t_helmets,ct_defuse_kits,ct_players_alive,t_players_alive,ct_weapon_ak47,t_weapon_ak47,ct_weapon_aug,t_weapon_aug,ct_weapon_awp,t_weapon_awp,ct_weapon_bizon,t_weapon_bizon,ct_weapon_cz75auto,t_weapon_cz75auto,ct_weapon_elite,t_weapon_elite,ct_weapon_famas,t_weapon_famas,ct_weapon_g3sg1,t_weapon_g3sg1,ct_weapon_galilar,t_weapon_galilar,ct_weapon_glock,t_weapon_glock,ct_weapon_m249,t_weapon_m249,ct_weapon_m4a1s,t_weapon_m4a1s,ct_weapon_m4a4,t_weapon_m4a4,ct_weapon_mac10,t_weapon_mac10,ct_weapon_mag7,t_weapon_mag7,ct_weapon_mp5sd,t_weapon_mp5sd,ct_weapon_mp7,t_weapon_mp7,ct_weapon_mp9,t_weapon_mp9,ct_weapon_negev,t_weapon_negev,ct_weapon_nova,t_weapon_nova,ct_weapon_p90,t_weapon_p90,ct_weapon_r8revolver,t_weapon_r8revolver,ct_weapon_sawedoff,t_weapon_sawedoff,ct_weapon_scar20,t_weapon_scar20,ct_weapon_sg553,t_weapon_sg553,ct_weapon_ssg08,t_weapon_ssg08,ct_weapon_ump45,t_weapon_ump45,ct_weapon_xm1014,t_weapon_xm1014,ct_weapon_deagle,t_weapon_deagle,ct_weapon_fiveseven,t_weapon_fiveseven,ct_weapon_usps,t_weapon_usps,ct_weapon_p250,t_weapon_p250,ct_weapon_p2000,t_weapon_p2000,ct_weapon_tec9,t_weapon_tec9,ct_grenade_hegrenade,t_grenade_hegrenade,ct_grenade_flashbang,t_grenade_flashbang,ct_grenade_smokegrenade,t_grenade_smokegrenade,ct_grenade_incendiarygrenade,t_grenade_incendiarygrenade,ct_grenade_molotovgrenade,t_grenade_molotovgrenade,ct_grenade_decoygrenade,t_grenade_decoygrenade,map_de_dust2,map_de_inferno,map_de_mirage,map_de_nuke,map_de_overpass,map_de_train,map_de_vertigo,bomb_planted_True,round_winner_T
0,175.0,0.0,0.0,500.0,500.0,0.0,0.0,4000.0,4000.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,False,False,False,False
1,156.03,0.0,0.0,500.0,500.0,400.0,300.0,600.0,650.0,0.0,0.0,1.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,False,False,False,False
2,96.03,0.0,0.0,391.0,400.0,294.0,200.0,750.0,500.0,0.0,0.0,1.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,False,False,False,False
3,76.03,0.0,0.0,391.0,400.0,294.0,200.0,750.0,500.0,0.0,0.0,1.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,False,False,False,False
4,174.97,1.0,0.0,500.0,500.0,192.0,0.0,18350.0,10750.0,0.0,0.0,1.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,False,False,False,False


# Logistic Regression

## Train Test Split

In [None]:
# Target is the dummy column derived from round_winner; every other column is a feature
target_column = 'round_winner_T'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Fit logistic model

In [None]:
# Number of iterations for the logistic regression model
num_iterations = 10000
model_filename = f'logreg_model_{num_iterations}.joblib'

# Check if the model file exists
if os.path.exists(model_filename):
    # Load the existing model
    logreg = load(model_filename)
else:
    # Train the model if it doesn't exist
    logreg = LogisticRegression(max_iter=num_iterations)
    logreg.fit(X_train, y_train)
    
    # Save the trained model
    dump(logreg, model_filename)

# Making predictions using the model
y_pred = logreg.predict(X_test)

# Evaluating the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.7429131606894862

Classification Report:
               precision    recall  f1-score   support

       False       0.73      0.76      0.74     12004
        True       0.76      0.72      0.74     12478

    accuracy                           0.74     24482
   macro avg       0.74      0.74      0.74     24482
weighted avg       0.74      0.74      0.74     24482


Confusion Matrix:
 [[9142 2862]
 [3432 9046]]
