## Trinidad and Tobago Lotto Plus Data
### ID: 816030232          Name: Justyn Caesar
### COMP3610 Assignment 1

#### This notebook demonstrates how to scrape, extract and format data from a website and use it to present useful information.

In [64]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
from datetime import datetime as dt

### Scraping NLCB website and extracting necessary fields  

In [None]:
# Build one 'Mon-YY' label (e.g. 'Jan-10') per month start in the range; each label is
# sent to the results page through the `monthyear` query parameter.
date_range = [
    month_start.strftime('%b-%y')
    for month_start in pd.date_range('2010-01-01', '2023-12-31', freq='1MS')
]

# Seconds to wait on a single request before giving up.
REQUEST_TIMEOUT_SECONDS = 30

data_list = []        # One list of whitespace-split tokens per 'lotto-tr' row (all data except date)
date_list = []        # One list of tokens per 'lotto-date-tr' row; dates live in separate <tr> elements

for month_label in date_range:
    url = f'https://www.nlcbplaywhelotto.com/nlcb-lotto-plus-results/?monthyear={month_label}'
    # A timeout keeps one unresponsive request from hanging the whole scrape.
    page = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on an HTTP error instead of parsing an error page into zero rows,
    # which would leave a silent gap in the dataset.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # Distinct loop names: the original inner loop reused `da`, shadowing the month label.
    for draw_row in soup.find_all('tr', class_='lotto-tr'):
        data_list.append(draw_row.text.split())
    for date_row in soup.find_all('tr', class_='lotto-date-tr'):
        date_list.append(date_row.text.split())

### Fixing the error where Jackpot and Wins became one value

In [None]:
# NOTE: this cell overwrites `date_list` with formatted strings, so it must be run exactly
# once after the scraping cell; re-running it without re-scraping fails in strptime.

# A '$' token holds the jackpot with the Wins count appended directly to it.
# Illustrative shape (assumes the jackpot is printed with two decimals):
#   '$2,000,000.001' -> '$2,000,000.00' and '1'
_JACKPOT_WINS_PATTERN = re.compile(r'(\$[\d,]*\.\d{2})(\d+)')


def _split_jackpot_and_wins(item):
    """Split a fused jackpot+wins token into ``[jackpot, wins]``.

    When the token looks like a dollar amount with two decimals followed by
    more digits, everything after the two decimals is taken as Wins, so a
    multi-digit Wins count stays intact. Any other '$' token falls back to
    treating only its final character as Wins.
    """
    fused = _JACKPOT_WINS_PATTERN.fullmatch(item)
    if fused:
        return [fused.group(1), fused.group(2)]
    return [item[:-1], item[-1]]


result = []
for row in data_list:
    updated_row = []
    for item in row:
        # Only the jackpot token starts with '$'; every other token is kept as-is.
        if item.startswith('$'):
            updated_row.extend(_split_jackpot_and_wins(item))
        else:
            updated_row.append(item)
    result.append(updated_row)

# Flatten the per-row token lists into one flat list of date strings.
new_date_list = [item for sublist in date_list for item in sublist]
# Re-format each date from 'dd-Mon-yy' to 'dd/mm/YYYY'.
date_list = [dt.strptime(p, '%d-%b-%y').strftime('%d/%m/%Y') for p in new_date_list]

df = pd.DataFrame(result, columns=["Draw #", 'Numbers', 'Power Ball', 'Multiplier', 'Jackpot', 'Wins'])
# Dates were collected separately from the draw rows; attach them as the first column.
df.insert(0, 'Draw Date', date_list)

# Persist the scraped table so later cells can reload it without hitting the site again.
df.to_csv('Results.csv', sep=",", index=False)

### Reloading the dataset as the site became unavailable for scraping

In [118]:
# Load the saved results from 'Results.csv' (the file written with df.to_csv in the
# formatting cell), replacing any in-memory `df`, so the cells below can run without scraping.
df = pd.read_csv('Results.csv')

### Cleaning data for better analysis

In [106]:
# Convert the count-like columns to nullable integers; values that cannot be
# parsed become <NA> instead of raising.
for int_col in ('Draw #', 'Wins'):
    df[int_col] = pd.to_numeric(df[int_col], errors='coerce').astype('Int64')

# Treat 'XX' entries in the Jackpot column as missing values.
df['Jackpot'] = df['Jackpot'].replace('XX', np.nan)

# Numeric copy of Jackpot for statistics: drop '$' and ',' and parse the rest,
# coercing anything unparseable to NaN. The df column itself keeps its text form.
jackpot_text = df["Jackpot"].replace(to_replace=r'\$|,', value='', regex=True)
jackpot_series = pd.to_numeric(jackpot_text, errors='coerce')

## Summary Statistics

In [117]:
# Descriptive statistics for the numeric columns, and the dtype of every column.
overview = df.describe()
data_types = df.dtypes

# Format the headline jackpot figures as dollar amounts with thousands separators.
max_jackpot = f'${jackpot_series.max():,.2f}'
avg_jackpot = f'${jackpot_series.mean():,.2f}'

# NOTE(review): the scraping cell requests months from 2010-01 through 2023-12, but the
# labels below say "January 2025" - confirm which period Results.csv actually covers.
print(f'Datatypes in the dataset:\n\n {data_types}\n\n')
print(f'Overview of data:\n\n {overview}\n')
print(f'Jackpot overview:\n {jackpot_series.describe()}\n')
print(f'Largest Jackpot between January 2010 and January 2025:\n {max_jackpot}\n')
print(f'Average Jackpot between January 2010 and January 2025:\n {avg_jackpot}\n')
# Double quotes inside the single-quoted f-string: reusing the same quote type is a
# SyntaxError on Python versions before 3.12.
print(f'The most wins in 1 draw between January 2010 and January 2025:\n {df["Wins"].max()}\n')


Datatypes in the dataset:

 Draw Date      object
Draw #          Int64
Numbers        object
Power Ball    float64
Multiplier    float64
Jackpot        object
Wins            Int64
dtype: object


Overview of data:

        Draw #  Power Ball  Multiplier    Wins
count 1403.00     1403.00      134.00 1357.00
mean  1587.00        5.50        8.43    0.10
std    405.16        2.94        6.76    0.32
min    886.00        0.00        3.00    0.00
25%   1236.50        3.00        4.00    0.00
50%   1587.00        6.00        4.00    0.00
75%   1937.50        8.00       10.00    0.00
max   2288.00       10.00       20.00    3.00

Jackpot overview:
 count       1357.00
mean     4499672.66
std      3983562.00
min            1.00
25%      2000000.00
50%      2887858.21
75%      5820630.46
max     31946408.24
Name: Jackpot, dtype: float64

Largest Jackpot between January 2010 and January 2025:
 $31,946,408.24

Average Jackpot between January 2010 and January 2025:
 $4,499,672.66

The most wins 