In [3]:
# Import necessary libraries
import pandas as pd
import os

# Specify the folder containing the CSV files
folder_path = r"C:\Users\suhan pahalage\Downloads\CASE2\CASE2\WorldCup_Stats"  # Replace with your folder path

# Get a list of all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Initialize an empty list to store DataFrames
dataframes = []

# Loop through each file and load it into a DataFrame
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame
crick_df = pd.concat(dataframes, ignore_index=True)

# Display the first few rows of the combined DataFrame
print(crick_df.head())


   Unnamed: 0.1  Unnamed: 0        date        venue match_category team_1  \
0             0          11         NaN   Nottingham   League-Match    PAK   
1             1           5         NaN        Leeds   League-Match    EAf   
2             2          12  1975-06-18        Leeds     Semi-Final    ENG   
3             3           8  1975-06-14   Birmingham   League-Match    ENG   
4             4          13         NaN     The Oval     Semi-Final     NZ   

  team_2  team_1_runs  team_1_wickets  team_2_runs  team_2_wickets  \
0     SL        330.0             6.0        138.0             0.0   
1    IND        120.0             0.0        123.0             0.0   
2    AUS         93.0             0.0         94.0             6.0   
3    EAf        290.0             5.0         94.0             0.0   
4     WI        158.0             0.0        159.0             5.0   

                                              result                 pom  \
0                           Pakist

In [4]:
# Examine the structure of the DataFrame
print("DataFrame Info:")
print(crick_df.info())  # Provides an overview of features, data types, and non-null counts

print("\nDataFrame Description:")
print(crick_df.describe(include='all'))  # Summary statistics for numeric and non-numeric columns

print("\nFirst few rows:")
print(crick_df.head())  # Preview the first few rows of the DataFrame

# Remove duplicate records
initial_shape = crick_df.shape  # Get the shape before removing duplicates
crick_df = crick_df.drop_duplicates()
print(f"\nRemoved {initial_shape[0] - crick_df.shape[0]} duplicate rows.")

# Check for null values
print("\nNull Values Count:")
print(crick_df.isnull().sum())  # Count null values in each column

# Remove rows with null values
crick_df = crick_df.dropna()
print("\nAfter removing null values:")
print(crick_df.info())  # Check structure again after cleaning

# Final DataFrame shape
print(f"\nFinal DataFrame Shape: {crick_df.shape}")


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528 entries, 0 to 527
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0.1     528 non-null    int64  
 1   Unnamed: 0       528 non-null    int64  
 2   date             364 non-null    object 
 3   venue            528 non-null    object 
 4   match_category   528 non-null    object 
 5   team_1           528 non-null    object 
 6   team_2           528 non-null    object 
 7   team_1_runs      518 non-null    float64
 8   team_1_wickets   518 non-null    float64
 9   team_2_runs      513 non-null    float64
 10  team_2_wickets   513 non-null    float64
 11  result           528 non-null    object 
 12  pom              510 non-null    object 
 13  best_batters     250 non-null    object 
 14  best_bowlers     250 non-null    object 
 15  commentary_line  83 non-null     object 
 16  world_cup_year   528 non-null    int64  
 17  

In [5]:
import pandas as pd
import numpy as np

# Step 1: Check for Outliers
# Focus on numeric columns
numeric_columns = crick_df.select_dtypes(include=['int64', 'float64']).columns
print("\nNumeric Columns:")
print(numeric_columns)

# Use Interquartile Range (IQR) to detect outliers
for col in numeric_columns:
    Q1 = crick_df[col].quantile(0.25)  # 25th percentile
    Q3 = crick_df[col].quantile(0.75)  # 75th percentile
    IQR = Q3 - Q1  # Interquartile range
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Identify outliers
    outliers = crick_df[(crick_df[col] < lower_bound) | (crick_df[col] > upper_bound)]
    print(f"\nOutliers detected in {col}: {len(outliers)} rows")

    # Remove outliers only if they are significant
    if len(outliers) > 0:
        print(f"Removing outliers in {col} beyond [{lower_bound}, {upper_bound}]")
        crick_df = crick_df[(crick_df[col] >= lower_bound) & (crick_df[col] <= upper_bound)]

# Step 2: Check for Missing Values
print("\nMissing Values Count After Outlier Removal:")
missing_values = crick_df.isnull().sum()
print(missing_values)

# Impute missing values only if necessary
for col in crick_df.columns:
    if missing_values[col] > 0:
        # State reason for imputation: Based on type of data
        if col in numeric_columns:
            print(f"Imputing missing values in {col} with mean value.")
            crick_df[col] = crick_df[col].fillna(crick_df[col].mean())
        else:
            print(f"Imputing missing values in {col} with mode value.")
            crick_df[col] = crick_df[col].fillna(crick_df[col].mode()[0])

# Final Check
print("\nFinal DataFrame Info After Outlier Removal and Imputation:")
print(crick_df.info())

# Display updated DataFrame structure
print("\nFinal DataFrame Preview:")
print(crick_df.head())



Numeric Columns:
Index(['Unnamed: 0.1', 'Unnamed: 0', 'team_1_runs', 'team_1_wickets',
       'team_2_runs', 'team_2_wickets', 'world_cup_year'],
      dtype='object')

Outliers detected in Unnamed: 0.1: 0 rows

Outliers detected in Unnamed: 0: 0 rows

Outliers detected in team_1_runs: 0 rows

Outliers detected in team_1_wickets: 0 rows

Outliers detected in team_2_runs: 1 rows
Removing outliers in team_2_runs beyond [82.0, 346.0]

Outliers detected in team_2_wickets: 0 rows

Outliers detected in world_cup_year: 0 rows

Missing Values Count After Outlier Removal:
Unnamed: 0.1       0
Unnamed: 0         0
date               0
venue              0
match_category     0
team_1             0
team_2             0
team_1_runs        0
team_1_wickets     0
team_2_runs        0
team_2_wickets     0
result             0
pom                0
best_batters       0
best_bowlers       0
commentary_line    0
world_cup_year     0
host_country       0
dtype: int64

Final DataFrame Info After Outlier Re

In [6]:
# Step 1: Create match_status column
# If 'result' contains 'abandoned', set match_status to 'abandoned'; otherwise, 'played'
crick_df['match_status'] = crick_df['result'].apply(
    lambda x: 'abandoned' if 'abandoned' in x.lower() else 'played'
)

# Step 2: Create winning_team column
def extract_winning_team(result):
    if 'abandoned' in result.lower():
        return None
    elif 'won by' in result:
        return result.split(' won by')[0]
    return None

crick_df['winning_team'] = crick_df['result'].apply(extract_winning_team)

# Step 3: Extract best_batters and best_bowlers into new columns
# Split and process 'best_batters'
def split_batters(batter_list):
    if isinstance(batter_list, list) and len(batter_list) > 0:
        # Extract names and runs
        batters = [b.split(' - ')[0] for b in batter_list]
        runs = [int(b.split(' - ')[1].split()[0]) for b in batter_list]
        return batters + runs
    return [None, None, None, None]  # Fill missing values if no data

# Create new columns for batters
crick_df[['best_batter_1', 'best_batter_2', 'best_batter_1_runs', 'best_batter_2_runs']] = crick_df['best_batters'].apply(
    split_batters
).apply(pd.Series)

# Split and process 'best_bowlers'
def split_bowlers(bowler_list):
    if isinstance(bowler_list, list) and len(bowler_list) > 0:
        # Extract names and wickets
        bowlers = [b.split(' - ')[0] for b in bowler_list]
        wickets = [int(b.split(' - ')[1]) for b in bowler_list]
        return bowlers + wickets
    return [None, None, None, None]  # Fill missing values if no data

# Create new columns for bowlers
crick_df[['best_bowler_1', 'best_bowler_2', 'best_bowler_1_wick', 'best_bowler_2_wick']] = crick_df['best_bowlers'].apply(
    split_bowlers
).apply(pd.Series)

# Step 4: Final Preview of the DataFrame
print(crick_df[['match_status', 'winning_team', 'best_batter_1', 'best_batter_2', 'best_batter_1_runs', 'best_batter_2_runs', 
                'best_bowler_1', 'best_bowler_2', 'best_bowler_1_wick', 'best_bowler_2_wick']].head())



    match_status  winning_team best_batter_1 best_batter_2 best_batter_1_runs  \
326       played     Australia          None          None               None   
327       played         India          None          None               None   
328       played  South Africa          None          None               None   
329       played      Zimbabwe          None          None               None   
332       played      Pakistan          None          None               None   

    best_batter_2_runs best_bowler_1 best_bowler_2 best_bowler_1_wick  \
326               None          None          None               None   
327               None          None          None               None   
328               None          None          None               None   
329               None          None          None               None   
332               None          None          None               None   

    best_bowler_2_wick  
326               None  
327               None  

In [7]:
# Step 1: Identify columns to drop
# Specify columns to remove (modify this list as necessary)
columns_to_drop = ['commentary_line']

# Drop the columns
crick_df = crick_df.drop(columns=columns_to_drop, errors='ignore')  # Use errors='ignore' to avoid errors if column not found

# Step 2: Verify the updated DataFrame structure
print("\nUpdated DataFrame Info:")
print(crick_df.info())

# Display the remaining columns
print("\nRemaining Columns:")
print(crick_df.columns)



Updated DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 72 entries, 326 to 527
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0.1        72 non-null     int64  
 1   Unnamed: 0          72 non-null     int64  
 2   date                72 non-null     object 
 3   venue               72 non-null     object 
 4   match_category      72 non-null     object 
 5   team_1              72 non-null     object 
 6   team_2              72 non-null     object 
 7   team_1_runs         72 non-null     float64
 8   team_1_wickets      72 non-null     float64
 9   team_2_runs         72 non-null     float64
 10  team_2_wickets      72 non-null     float64
 11  result              72 non-null     object 
 12  pom                 72 non-null     object 
 13  best_batters        72 non-null     object 
 14  best_bowlers        72 non-null     object 
 15  world_cup_year      72 non-null     

In [12]:
# Step 1: Import libraries
import pandas as pd
import transformers
from transformers import pipeline
import matplotlib.pyplot as plt

# Step 2: Load the data
file_path = r"C:\Users\suhan pahalage\Downloads\CASE2\CASE2\commentary_2023.csv"  # Replace with the correct file path
data = pd.read_csv(file_path)

# Preview the dataset
print("\nDataset Head:")
print(data.head())

# Step 3: Select and initialize the Hugging Face sentiment analysis model
# Using distilbert-base-uncased-finetuned-sst-2-english
print("\nLoading sentiment analysis model...")
sentiment_model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Step 4: Detect sentiment for each commentary
print("\nDetecting sentiment for each commentary...")
data['sentiment'] = data['commentary'].apply(lambda x: sentiment_model(x)[0]['label'])

# Preview the dataset with sentiment
print("\nDataset with Sentiment Column:")
print(data.head())

# Step 5: Visualize the sentiment spread
# Count the number of positive and negative sentiments
sentiment_counts = data['sentiment'].value_counts()

# Plot the sentiment spread
plt.figure(figsize=(8, 6))
sentiment_counts.plot(kind='bar', color=['skyblue', 'orange'])
plt.title("Sentiment Spread of Commentary Excerpts")
plt.xlabel("Sentiment")
plt.ylabel("Count")
plt.xticks(rotation=0)
plt.show()

# Save the updated dataset
output_file = "commentary_with_sentiments.csv"
data.to_csv(output_file, index=False)
print(f"\nUpdated dataset saved to: {output_file}")



Dataset Head:
   Unnamed: 0                                         commentary
0          19  Travis Head | Player of the Match - 137(120): ...
1          20  Adam Zampa: Interesting to see the strategy to...
2          21  Mitchell Starc: He (Cummins) was phenomenal, h...
3          22  Steve Smith: Incredible feeling! Atmosphere wa...
4          23  Mitch Marsh: Pure elation right now. Been the ...

Loading sentiment analysis model...


RuntimeError: At least one of TensorFlow 2.0 or PyTorch should be installed. To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ To install PyTorch, read the instructions at https://pytorch.org/.

In [6]:
pip install transformers

Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
     ---------------------------------------- 0.0/44.1 kB ? eta -:--:--
     --------- ------------------------------ 10.2/44.1 kB ? eta -:--:--
     ----------------- -------------------- 20.5/44.1 kB 165.2 kB/s eta 0:00:01
     -------------------------- ----------- 30.7/44.1 kB 163.8 kB/s eta 0:00:01
     -------------------------- ----------- 30.7/44.1 kB 163.8 kB/s eta 0:00:01
     ----------------------------------- -- 41.0/44.1 kB 140.3 kB/s eta 0:00:01
     -------------------------------------- 44.1/44.1 kB 135.6 kB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.3-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp311-none-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp311-none-win_amd64.

In [11]:
pip install --upgrade transformers
pip install torch

SyntaxError: invalid syntax (3561013074.py, line 1)