In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import random

FILE_NAME = 'hotpicks.csv'

## Create a DataFrame from the csv file

#### Vectorized operations

In [4]:
# def stats() -> pd.DataFrame:
#     # Read CSV
#     df = pd.read_csv(FILE_NAME)

#     # Drop unnecessary columns
#     df.drop(['Ball Set', 'Machine', 'DrawNumber', 'DrawDate'], axis=1, inplace=True)
        
#     # Flatten the DataFrame to a Pandas Series converting the numbers into a single array.
#     all_numbers = pd.Series(df.values.ravel())
    
#     # Count frequencies using pandas
#     counts = all_numbers.value_counts()

#     # Create result DataFrame
#     return pd.DataFrame({'Numbers': counts.index, 'Frequency': counts.values})

#### Fully Vectorized and returning a Pandas DataFrame

In [17]:
# def stats() -> pd.DataFrame:
#     # Read CSV
#     df = pd.read_csv(FILE_NAME)

#     # Drop unnecessary columns
#     df.drop(['Ball Set', 'Machine', 'DrawNumber', 'DrawDate'], axis=1, inplace=True)
        
#     # Flatten the DataFrame converting the numbers into a single Numpy array.
#     all_numbers = df.values.ravel()
    
#     # Get counts using numpy's unique, which is optimized and vectorized
#     # Return a tuple with 2 arrays, the values and their occurrences
#     unique_counts = np.unique(all_numbers, return_counts=True)

#     # Create a DataFrame
#     result_df = pd.DataFrame({
#         'Numbers': unique_counts[0],
#         'Frequency': unique_counts[1]
#     }).sort_values(by='Frequency', ascending=False).reset_index(drop=True) # reset the index order after sorting
    
#     return result_df

## Create a Numpy Array from the csv file

#### Fully vectorized, returning a NumPy array

In [2]:
def stats(file_name=None,
          drop_columns: tuple = ('Ball Set', 'Machine', 'DrawNumber', 'DrawDate')) -> np.ndarray:
    """
    Load a draw-history CSV and return every remaining cell as one flat array.

    Parameters
    ----------
    file_name : str or path-like, optional
        CSV file to read. When omitted (``None``) the module-level
        ``FILE_NAME`` constant is used, so ``stats()`` behaves as before.
    drop_columns : sequence of str, optional
        Columns to discard before flattening. Defaults to the four
        columns the notebook has always dropped.

    Returns
    -------
    np.ndarray
        1-D array holding the values of all columns that were not dropped,
        flattened row by row.

    Raises
    ------
    KeyError
        If any name in ``drop_columns`` is not a column of the CSV
        (pandas' default ``errors='raise'`` behaviour of ``DataFrame.drop``).
    """
    # Resolve the default lazily so the constant is only needed when the
    # caller does not supply a path.
    if file_name is None:
        file_name = FILE_NAME

    # Read CSV
    draws = pd.read_csv(file_name)

    # Drop unnecessary columns; a new frame is returned instead of mutating in place
    numbers_only = draws.drop(columns=list(drop_columns))

    # Flatten the remaining columns into a single 1-D NumPy array
    return numbers_only.to_numpy().ravel()

# Flat NumPy array returned by stats() (despite the name, not a DataFrame);
# it is the pool that rand_numb_range_vectorized() samples from further down.
df2 = stats()

### Create a DataFrame with the 20 most frequently occurring numbers

#### For-loop function (slow for large datasets)

In [189]:
# def rand_numb_range(numbers: pd.Series) -> Counter:
#     """Randomly select a number from a pandas Series and count occurrences over 1,000,000 iterations."""
    
#     my_counter = Counter()
    
#     for _ in range(1_000_000):
#         num = numbers.iloc[random.randrange(0, len(numbers))]
#         my_counter[num] += 1
    
#     return my_counter

#### batch sampling function

In [190]:
# def rand_numb_range(numbers: pd.Series) -> pd.DataFrame:
#     """Randomly select a number from a pandas Series and count occurrences over 1,000,000 iterations."""
    
#     my_counter = Counter()
#     # Number of iterations
#     total_iterations = 1_000_000

#     # Convert Series to list for sampling. 
#     # random.choices requires a list not a pd.Series
#     number_list = numbers.tolist()

#     # Sample all at once
#     samples = random.choices(number_list, k=total_iterations)

#     # Count frequencies
#     my_counter.update(samples)
    
#     # Create result DataFrame
#     return pd.DataFrame({'Numbers':my_counter.keys(),'Frequency':my_counter.values()})

#### vectorized operation

In [158]:
# def rand_numb_range(numbers: pd.Series) -> pd.DataFrame:
#     """Randomly select a number from a pandas Series and count occurrences over 1,000,000 iterations."""
    
#     my_counter = Counter()
#     # Number of iterations
#     total_iterations = 1_000_000
    
#     # Convert Series to list for sampling. 
#     # random.choices requires a list not a pd.Series
#     number_list = numbers.tolist()

#     # Use random.choices to sample all at once for efficiency
#     samples = random.choices(number_list, k = total_iterations)
#     # convert list to Pandas Series
#     samples_pd = pd.Series(samples)
   
#     # Count frequencies using pandas
#     counts = samples_pd.value_counts()

#     # Create result DataFrame
#     return pd.DataFrame({'Numbers': counts.index, 'Frequency': counts.values})

#### Fully Vectorized

In [3]:
def rand_numb_range_vectorized(numbers: np.ndarray,
                               total_iterations: int = 100_000_000,
                               seed=None) -> pd.DataFrame:
    """
    Sample values from a NumPy array with replacement and tabulate how often
    each distinct value was drawn.

    Parameters
    ----------
    numbers : np.ndarray
        1-D pool of values to sample from (duplicates make a value
        proportionally more likely to be drawn).
    total_iterations : int, optional
        How many samples to draw. Defaults to 100,000,000, the value the
        notebook has always used. All samples are materialised in a single
        array, so memory use grows linearly with this value.
    seed : int, optional
        Seed for a private ``numpy.random.Generator``. When omitted (``None``)
        NumPy's global random state is used, exactly as before, so an earlier
        ``np.random.seed(...)`` call is still honoured.

    Returns
    -------
    pd.DataFrame
        Columns ``'Numbers'`` (distinct sampled values) and ``'Frequency'``
        (times each was drawn), sorted by ``'Frequency'`` descending with a
        fresh 0..n-1 index. Rows with equal frequency keep ascending
        ``'Numbers'`` order.
    """
    # np.random (module) and Generator both expose choice(a, size=, replace=),
    # so either can serve as the sampler.
    rng = np.random if seed is None else np.random.default_rng(seed)

    # Vectorized sampling; replace=True => the same number may be drawn repeatedly
    samples = rng.choice(numbers, size=total_iterations, replace=True)

    # np.unique returns the sorted distinct values and, with return_counts, their tallies
    values, counts = np.unique(samples, return_counts=True)

    # A stable sort keeps ties in np.unique's ascending order, making the
    # output fully deterministic for a given set of samples.
    return (
        pd.DataFrame({'Numbers': values, 'Frequency': counts})
        .sort_values(by='Frequency', ascending=False, kind='mergesort')
        .reset_index(drop=True)  # reset the index order after sorting
    )

# Run the sampling simulation on the flattened draw history
df = rand_numb_range_vectorized(numbers=df2)
# Keep only the 20 rows whose 'Frequency' is greatest
dflargest = df.nlargest(n=20, columns='Frequency')

### Returns 3 random numbers chosen from the DataFrame

#### with set and while loop

In [65]:
# Return a set of 3 random numbers chosen from the 20 with the highest frequency
# tries = set()
# len(tries)
# while len(tries) <= 2:
#   tries.add(dflargest['Numbers'].loc[dflargest.index[random.randrange(len(dflargest))]])

# tries  

#### With set and sample

In [62]:
# # DataFrame with top 20 numbers sorted by frequency
# top_20 = dflargest.head(20)['Numbers']
# # Randomly sample 3 unique numbers without repetition
# tries = set(random.sample(top_20.tolist(), 3))

# print(tries)

#### vectorized operations using NumPy

In [4]:
# 'Numbers' column of the first 20 rows of dflargest, as a NumPy array
top_20 = dflargest['Numbers'].head(20).to_numpy()

# Draw 3 values in one vectorized call; replace=False => no number can be picked twice
samples = np.random.choice(top_20, 3, replace=False)

# Show the 3 picks
print(samples)

[16  7  3]


### Returns 4 random numbers chosen from the DataFrame

In [79]:
# Draw 4 values from the same top_20 pool; replace=False => no number can be picked twice
samples = np.random.choice(a=top_20, size=4, replace=False)

# Show the 4 picks
print(samples)

[19 43 31  7]


In [5]:
# IPython magic: show matplotlib figures inline in the notebook output
%matplotlib inline

# Plotly's offline helpers; only init_notebook_mode is called in this cell
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# cufflinks supplies the DataFrame.iplot() method used by the plotting cell below
import cufflinks as cf
# Initialise Plotly for use inside the notebook
init_notebook_mode(connected=True)

# Switch cufflinks to offline plotting
cf.go_offline()

### Graph iPlot

In [6]:
# Interactive bar chart of the 20 most frequently sampled numbers.
# A title and axis labels are set so the figure can be read on its own.
dflargest.iplot(
    kind='bar',
    x='Numbers',
    y='Frequency',
    title='Top 20 most frequently sampled numbers',
    xTitle='Number',
    yTitle='Frequency',
)