In [1]:
import pandas as pd
import os
import re

In [2]:
""""
- Input each of the 12 monthly files
- Create a 'file date' using the month found in the file name
    The Null value should be replaced as 1
- Clean the Market Cap value to ensure it is the true value as 'Market Capitalisation'
    Remove any rows with 'n/a'
-Categorise the Purchase Price into groupings
    0 to 24,999.99 as 'Low'
    25,000 to 49,999.99 as 'Medium'
    50,000 to 74,999.99 as 'Large'
    75,000 to 100,000 as 'Very Large'
- Categorise the Market Cap into groupings
    Below $100M as 'Small'
    Between $100M and below $1B as 'Medium'
    Between $1B and below $100B as 'Large' 
    $100B and above as 'Huge'
- Rank the highest 5 purchases per combination of: file date, Purchase Price Categorisation and Market Capitalisation Categorisation.
Output only records with a rank of 1 to 5

"""

'"\n- Input each of the 12 monthly files\n- Create a \'file date\' using the month found in the file name\n    The Null value should be replaced as 1\n- Clean the Market Cap value to ensure it is the true value as \'Market Capitalisation\'\n    Remove any rows with \'n/a\'\n-Categorise the Purchase Price into groupings\n    0 to 24,999.99 as \'Low\'\n    25,000 to 49,999.99 as \'Medium\'\n    50,000 to 74,999.99 as \'Large\'\n    75,000 to 100,000 as \'Very Large\'\n- Categorise the Market Cap into groupings\n    Below $100M as \'Small\'\n    Between $100M and below $1B as \'Medium\'\n    Between $1B and below $100B as \'Large\' \n    $100B and above as \'Huge\'\n- Rank the highest 5 purchases per combination of: file date, Purchase Price Categorisation and Market Capitalisation Categorisation.\nOutput only records with a rank of 1 to 5\n\n'

In [5]:
# Define the directory where your CSV files are located
csv_dir = os.getcwd()

# Get the list of CSV files in the directory
csv_files = [file for file in os.listdir(csv_dir) if file.endswith('.csv')]

# Initialize an empty list to store the DataFrames
dfs = []

# Iterate through each CSV file
for csv_file in csv_files:
    # Extract the number from the file name
    try:
        month_number = int(re.search(r'-(\d+)', csv_file).group(1))
    except AttributeError:
        month_number =  1 
    # Read the CSV file into a DataFrame
    df = pd.read_csv(os.path.join(csv_dir, csv_file))
    
    # Add a "Month" column with the extracted month number
    df['Month'] = month_number

    # Append the DataFrame to the list
    dfs.append(df)

# Concatenate all the DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)


In [6]:
def convert_price(price):
        price = price.replace('$', '')
        
        if 'M' in price:
            price = float(re.sub(r'[^0-9.]', '', price)) * 1000000
        elif 'B' in price:
            price = float(re.sub(r'[^0-9.]', '', price)) * 1000000000
        else:
            price = float(price)
        
        return price

In [7]:
combined_df.head()
combined_df = combined_df[~combined_df['Market Cap'].isnull()]


combined_df['Market Cap'] = combined_df['Market Cap'].apply(convert_price)
combined_df['Purchase Price'] = combined_df['Purchase Price'].apply(convert_price)


In [8]:
def purchase_price_cat(df): 
    if df['Purchase Price'] <25000:
        return 'Low'
    elif df['Purchase Price'] < 50000: 
        return 'Medium'
    elif df['Purchase Price'] < 75000: 
        return 'Large'
    else:
        return 'Very Large'
    
combined_df['Purchase Price Category'] = combined_df.apply(purchase_price_cat, axis='columns')

In [9]:
def market_cap_cat(df): 
    if df['Market Cap'] < 100000000:
        return 'Small'
    elif df['Market Cap'] < 1000000000: 
        return 'Medium'
    elif df['Market Cap'] < 10000000000: 
        return 'Large'
    else:
        return 'Huge'
    
combined_df['Market Cap Category'] = combined_df.apply(market_cap_cat, axis='columns')

In [10]:
combined_df.head()

Unnamed: 0,id,first_name,last_name,Ticker,Sector,Market,Stock Name,Market Cap,Purchase Price,Month,Purchase Price Category,Market Cap Category
0,1,Vikki,Pascall,RIC,Basic Industries,NYSE,"Richmont Mines, Inc.",458830000.0,78924.65,10,Very Large,Medium
1,2,Beulah,Michallat,ELY,Consumer Non-Durables,NYSE,Callaway Golf Company,1140000000.0,89818.72,10,Very Large,Large
3,4,Willis,Bugg,OAS,Energy,NYSE,Oasis Petroleum Inc.,1940000000.0,23636.92,10,Low,Large
4,5,Balduin,Maffei,QQXT,,NASDAQ,First Trust NASDAQ-100 Ex-Technology Sector In...,101070000.0,65979.23,10,Large,Medium
5,6,Octavia,Rawll,IMKTA,Consumer Services,NASDAQ,"Ingles Markets, Incorporated",687820000.0,41824.21,10,Medium,Medium


In [11]:
df_sorted = combined_df.sort_values(by=['Market Cap Category', 'Purchase Price Category', 'Month', 'Purchase Price'], ascending=[True, True,True,False])


df_sorted['Rank'] = df_sorted.groupby(by=['Market Cap Category', 'Purchase Price Category', 'Month']).cumcount() +1 

In [12]:
df_sorted = df_sorted[df_sorted['Rank'] <=5 ]
df_sorted

Unnamed: 0,id,first_name,last_name,Ticker,Sector,Market,Stock Name,Market Cap,Purchase Price,Month,Purchase Price Category,Market Cap Category,Rank
11965,966,Artemas,Franzini,A,Capital Goods,NYSE,"Agilent Technologies, Inc.",1.912000e+10,74995.16,1,Large,Huge,1
11646,647,Harriot,Odhams,BCH,Finance,NYSE,Banco De Chile,1.286000e+10,73529.30,1,Large,Huge,2
11148,149,Agnesse,Mulcock,RELX,Consumer Services,NYSE,RELX PLC,4.523000e+10,72197.77,1,Large,Huge,3
11426,427,Way,O'Lagen,VTR,Consumer Services,NYSE,"Ventas, Inc.",2.467000e+10,71276.64,1,Large,Huge,4
11194,195,Alvie,Hills,APC,Energy,NYSE,Anadarko Petroleum Corporation,2.636000e+10,70530.32,1,Large,Huge,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2352,353,Welbie,Robbey,ORMP,Health Care,NASDAQ,Oramed Pharmaceuticals Inc.,9.891000e+07,99965.64,12,Very Large,Small,1
2117,118,Rice,Bentote,QAT,,NASDAQ,iShares MSCI Qatar Capped ETF,3.896000e+07,99349.08,12,Very Large,Small,2
2664,665,Juditha,Bengefield,APEN,Health Care,NASDAQ,"Apollo Endosurgery, Inc.",7.233000e+07,98709.71,12,Very Large,Small,3
2958,959,Thomasin,Edmed,PTIE,Health Care,NASDAQ,"Pain Therapeutics, Inc.",2.703000e+07,98696.95,12,Very Large,Small,4


In [None]:
df_sorted.to_csv('output.csv')