In [None]:
pip install pandas

In [None]:
pip install matplotlib

In [None]:
pip install seaborn

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset from the project's data folder.
data = pd.read_csv('../data/wnba_pbp_2018.csv')

# Display basic information about the dataset (columns, non-null counts, dtypes).
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
data.info()

# Display the first few rows of the dataset.
print(data.head())

# Describe the dataset to get statistical summaries of numerical columns.
print(data.describe())

# Count the missing values in each column.
print(data.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84811 entries, 0 to 84810
Data columns (total 61 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Unnamed: 0                       84811 non-null  int64  
 1   id                               84811 non-null  int64  
 2   sequence_number                  84811 non-null  int64  
 3   type_id                          84811 non-null  int64  
 4   type_text                        84811 non-null  object 
 5   text                             84808 non-null  object 
 6   away_score                       84811 non-null  int64  
 7   home_score                       84811 non-null  int64  
 8   period_number                    84811 non-null  int64  
 9   period_display_value             84811 non-null  object 
 10  clock_display_value              84811 non-null  object 
 11  scoring_play                     84811 non-null  bool   
 12  score_value       

In [None]:
# Draw one histogram per numerical column on a single 12x10 figure
# to get a first look at how the values are distributed.
hist_figsize = (12, 10)
data.hist(figsize=hist_figsize)
plt.show()

In [None]:
# Report the dtype pandas assigned to every column.
column_dtypes = data.dtypes
print(column_dtypes)

In [None]:
# Restrict the frame to its numeric columns so the correlation below is
# computed over numeric data only.
numeric_data = data.select_dtypes(include=np.number)

# Pairwise correlation between every pair of numeric columns.
correlation_matrix = numeric_data.corr()

In [None]:
# Render the correlation matrix as an annotated heatmap on a 10x8 figure.
# Cell values are printed with two decimals using the 'coolwarm' colour map.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
plt.show()

In [3]:
# Preview the first ten rows of the dataset as plain text.
first_ten_rows = data.head(10)
print(first_ten_rows)

   Unnamed: 0           id  sequence_number  type_id             type_text  \
0           1   4010796954                4      615              Jumpball   
1           2   4010796957                7    20132   Step Back Jump Shot   
2           3   4010796959                9       12           Kicked Ball   
3           4  40107969510               10    30092             Jump Shot   
4           5  40107969511               11      156     Offensive Rebound   
5           6  40107969512               12       70   Shot Clock Turnover   
6           7  40107969513               13    20114  Turnaround Jump Shot   
7           8  40107969514               14      156     Offensive Rebound   
8           9  40107969515               15    20094              Tip Shot   
9          10  40107969516               16      156     Offensive Rebound   

                                                text  away_score  home_score  \
0  Breanna Stewart vs. LaToya Sanders (Elena Dell...         

In [7]:
pip install sentence-transformers faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.6 kB)
Downloading faiss_cpu-1.8.0-cp312-cp312-macosx_11_0_arm64.whl (3.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.1/3.1 MB 20.6 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0

[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: /opt/homebrew/Cellar/jupyterlab/4.0.11/libexec/bin/python -m pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [8]:
from sentence_transformers import SentenceTransformer
import faiss

In [12]:
# Collect the DataFrame's column labels into a plain Python list and show them.
column_names = list(data.columns)
print(column_names)

['Unnamed: 0', 'id', 'sequence_number', 'type_id', 'type_text', 'text', 'away_score', 'home_score', 'period_number', 'period_display_value', 'clock_display_value', 'scoring_play', 'score_value', 'team_id', 'athlete_id_1', 'athlete_id_2', 'athlete_id_3', 'wallclock', 'shooting_play', 'coordinate_x_raw', 'coordinate_y_raw', 'season', 'season_type', 'away_team_id', 'away_team_name', 'away_team_mascot', 'away_team_abbrev', 'away_team_name_alt', 'home_team_id', 'home_team_name', 'home_team_mascot', 'home_team_abbrev', 'home_team_name_alt', 'home_team_spread', 'game_spread', 'home_favorite', 'game_spread_available', 'game_id', 'qtr', 'time', 'clock_minutes', 'clock_seconds', 'half', 'game_half', 'lead_qtr', 'lead_game_half', 'start_quarter_seconds_remaining', 'start_half_seconds_remaining', 'start_game_seconds_remaining', 'game_play_number', 'end_quarter_seconds_remaining', 'end_half_seconds_remaining', 'end_game_seconds_remaining', 'period', 'lag_qtr', 'lag_game_half', 'coordinate_x', 'coor

In [14]:
# Build one space-separated string per row from the string form of every
# source column; this is the text that gets embedded later.
# 'combined_text' itself is excluded from the join so that re-running this
# cell does not fold the previous result back into the new one.
source_columns = [col for col in data.columns if col != 'combined_text']
data['combined_text'] = data[source_columns].apply(
    lambda row: ' '.join(row.values.astype(str)), axis=1
)

In [16]:
# Pull the per-row combined strings out of the DataFrame as a Python list.
combined_column = data['combined_text']
texts = combined_column.to_list()

In [17]:
from sentence_transformers import SentenceTransformer

# Instantiate the pretrained sentence-embedding model by name.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# Encode every combined row string into an embedding vector.
text_embeddings = model.encode(texts)



In [18]:
import faiss

# Embedding width, read from the second axis of the embeddings array.
d = text_embeddings.shape[1]

# Create a flat index that compares vectors by L2 (Euclidean) distance.
index = faiss.IndexFlatL2(d)

# Insert all embeddings into the index.
index.add(text_embeddings)

# Persist the populated index to disk so it can be reloaded later.
index_path = '../data/vector_store.index'
faiss.write_index(index, index_path)

In [21]:
# Load the index from the file
index = faiss.read_index('../data/vector_store.index')

# The ids returned by the search are used below as row positions in `data`.
# If the index file was built from a frame of a different size, those positions
# would silently point at the wrong rows, so fail loudly instead.
if index.ntotal != len(data):
    raise ValueError(
        f"Vector index holds {index.ntotal} vectors but data has {len(data)} rows; "
        "rebuild the index before querying."
    )

# Convert the query into a vector with the same model used for the row texts
query = "Who made the first scoring play in the first quarter?"
query_embedding = model.encode([query])

# Search the vector store
k = 5  # Number of nearest neighbors
D, I = index.search(query_embedding, k)

# I contains the indices of the nearest neighbors (one row per query)
print("Indices of the nearest neighbors:", I)

# FAISS reports -1 for slots it could not fill (fewer than k neighbors found).
# Drop those, because iloc would interpret -1 as "the last row".
neighbor_rows = [int(i) for i in I[0] if i != -1]

# Retrieve the rows corresponding to the indices
results = data.iloc[neighbor_rows].to_dict(orient='records')
print("Query results:", results)

Indices of the nearest neighbors: [[17754 24948 25138 25250 25240]]
Query results: [{'Unnamed: 0': 17755, 'id': 401018950167, 'sequence_number': 167, 'type_id': 44, 'type_text': 'Shooting Foul', 'text': 'Imani McGee-Stafford shooting foul', 'away_score': 34, 'home_score': 36, 'period_number': 2, 'period_display_value': '2nd Quarter', 'clock_display_value': '7:27', 'scoring_play': False, 'score_value': 0, 'team_id': 20.0, 'athlete_id_1': 2984237.0, 'athlete_id_2': nan, 'athlete_id_3': nan, 'wallclock': '2018-08-07T23:33:38Z', 'shooting_play': False, 'coordinate_x_raw': 24, 'coordinate_y_raw': 3.0, 'season': 2018, 'season_type': 2, 'away_team_id': 17, 'away_team_name': 'Las Vegas', 'away_team_mascot': 'Aces', 'away_team_abbrev': 'LV', 'away_team_name_alt': 'Las Vegas', 'home_team_id': 20, 'home_team_name': 'Atlanta', 'home_team_mascot': 'Dream', 'home_team_abbrev': 'ATL', 'home_team_name_alt': 'Atlanta', 'home_team_spread': 2.5, 'game_spread': 2.5, 'home_favorite': True, 'game_spread_ava