# We match our rating data to the main dataset

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import date

## Loading data

In [2]:
# Read the three prepared inputs from the 'Prepared Frames' directory:
# the rating actions, the ticker-to-rating-name matches and the main dataset.
ratings_df = pd.read_csv('Prepared Frames/rating_data.csv')
df_companies = pd.read_csv('Prepared Frames/companies_approx_match.csv')
main_df = pd.read_csv('Prepared Frames/main_data.csv')

## Merging the found matches to the main Dataframe

In [3]:
main_df = pd.merge(left = main_df,
                                 right = df_companies[['Ticker','reduced_matches']],
                                 # A left join keeps every row of main_df so we do not lose information;
                                 # tickers without a match get NaN in 'reduced_matches', which we have to handle later
                                 how = 'left', 
                                 left_on = ['Ticker'], 
                                 right_on= ['Ticker'])

## Assigning the ratings
We now have to assign the ratings. The difficulty is that the rating dates do not coincide with the report dates: a rating applies to every report dated on or after the rating date, and it has to be overwritten as soon as a newer rating exists for that company.

In [4]:
# Ratings can only be assigned to rows whose company was matched,
# so we continue with an independent copy of just those rows
df_rated = main_df.loc[main_df['reduced_matches'].notna()].copy()

In [5]:
# Order the rows chronologically by their report date
df_rated = df_rated.sort_values('Report Date')
df_rated.head(3)

Unnamed: 0,Ticker,Company Name,IndustryId,Fiscal Year,Fiscal Period,Report Date,Shares (Diluted),"Cash, Cash Equivalents & Short Term Investments",Accounts & Notes Receivable,Total Current Assets,...,Abnormal Gains (Losses),Pretax Income (Loss),"Income Tax (Expense) Benefit, Net",Income (Loss) from Continuing Operations,Net Extraordinary Gains (Losses),Net Income,Net Income (Common),Sector,Industry,reduced_matches
18826,LE,"LANDS' END, INC.",103002.0,2016,Q4,2017-01-31,31750000.0,213108000.0,39284000.0,607400000.0,...,0.0,-157603000.0,62782000.0,-94821000.0,0.0,-94821000.0,-94821000.0,Consumer Cyclical,Retail - Apparel & Specialty,lands end
33972,UNFI,UNITED NATURAL FOODS INC,102001.0,2017,Q2,2017-01-31,50755000.0,30658000.0,514870000.0,1638398000.0,...,0.0,42028000.0,-16546000.0,25482000.0,0.0,25482000.0,25482000.0,Consumer Defensive,Retail - Defensive,united natural foods
29355,SFS,"Smart & Final Stores, Inc.",102001.0,2016,Q4,2017-01-31,82302386.0,54235000.0,31809000.0,435636000.0,...,0.0,-1903000.0,1650000.0,-253000.0,0.0,-253000.0,-253000.0,Consumer Defensive,Retail - Defensive,smart final stores llc


In [6]:
# Collect every distinct matched company name that occurs in the rated dataframe
companies = df_rated['reduced_matches'].unique().tolist()

In [7]:
len(companies)

652

#### Checking if we can match right

In [8]:
df_rated[df_rated['reduced_matches']==companies[0]].head(2)

Unnamed: 0,Ticker,Company Name,IndustryId,Fiscal Year,Fiscal Period,Report Date,Shares (Diluted),"Cash, Cash Equivalents & Short Term Investments",Accounts & Notes Receivable,Total Current Assets,...,Abnormal Gains (Losses),Pretax Income (Loss),"Income Tax (Expense) Benefit, Net",Income (Loss) from Continuing Operations,Net Extraordinary Gains (Losses),Net Income,Net Income (Common),Sector,Industry,reduced_matches
18826,LE,"LANDS' END, INC.",103002.0,2016,Q4,2017-01-31,31750000.0,213108000.0,39284000.0,607400000.0,...,0.0,-157603000.0,62782000.0,-94821000.0,0.0,-94821000.0,-94821000.0,Consumer Cyclical,Retail - Apparel & Specialty,lands end
18827,LE,"LANDS' END, INC.",103002.0,2017,Q1,2017-04-30,32002000.0,139810000.0,32731000.0,523764000.0,...,0.0,-12103000.0,4264000.0,-7839000.0,0.0,-7839000.0,-7839000.0,Consumer Cyclical,Retail - Apparel & Specialty,lands end


In [9]:
ratings_df[ratings_df['reduced_name']==companies[0]]

Unnamed: 0,Company,Date,Rating,Company_lower,reduced_name
48056,"Lands' End, Inc.",2014-03-05,B1,"lands' end, inc.",lands end
48057,"Lands' End, Inc.",2016-09-16,B2,"lands' end, inc.",lands end
48058,"Lands' End, Inc.",2017-04-27,B3,"lands' end, inc.",lands end
48059,"Lands' End, Inc.",2014-03-05,B1,"lands' end, inc.",lands end
48060,"Lands' End, Inc.",2016-09-16,B2,"lands' end, inc.",lands end
48061,"Lands' End, Inc.",2017-04-27,B3,"lands' end, inc.",lands end
48062,"Lands' End, Inc.",2020-06-23,Caa1,"lands' end, inc.",lands end
48063,"Lands' End, Inc.",2020-10-16,B3,"lands' end, inc.",lands end


In [10]:
ratings_df.columns

Index(['Company', 'Date', 'Rating', 'Company_lower', 'reduced_name'], dtype='object')

#### We have to make sure to overwrite only the greater dates with updated rating information
We will test it manually for the first company and do the rest with a function

In [11]:
# Build a sub-dataframe with only the first company's rating actions, oldest first,
# and a fresh 0..n-1 index
current_ratings = (
    ratings_df.loc[ratings_df['reduced_name'] == companies[0]]
    .sort_values('Date')
    .reset_index(drop=True)
)
current_ratings

Unnamed: 0,Company,Date,Rating,Company_lower,reduced_name
0,"Lands' End, Inc.",2014-03-05,B1,"lands' end, inc.",lands end
1,"Lands' End, Inc.",2014-03-05,B1,"lands' end, inc.",lands end
2,"Lands' End, Inc.",2016-09-16,B2,"lands' end, inc.",lands end
3,"Lands' End, Inc.",2016-09-16,B2,"lands' end, inc.",lands end
4,"Lands' End, Inc.",2017-04-27,B3,"lands' end, inc.",lands end
5,"Lands' End, Inc.",2017-04-27,B3,"lands' end, inc.",lands end
6,"Lands' End, Inc.",2020-06-23,Caa1,"lands' end, inc.",lands end
7,"Lands' End, Inc.",2020-10-16,B3,"lands' end, inc.",lands end


In [12]:
# Apply the rating actions in the order of current_ratings (sorted oldest first):
# each action overwrites the rating of every report of that company dated on or after
# the action, so the latest applicable action wins.
# The loop variable is named `rating_date` so that it does not shadow `date`,
# which was imported from `datetime` at the top of the notebook.
for company, rating_date, rating in zip(current_ratings['reduced_name'],
                                        current_ratings['Date'],
                                        current_ratings['Rating']):
    # We need to address the part of the dataframe where the company is the same
    # and the report date is on or after the rating date
    df_rated.loc[(df_rated['reduced_matches']==company)&(df_rated['Report Date']>=rating_date),'Rating'] = rating

In [13]:
# Checking the result
# df_rated[df_rated['reduced_matches']==companies[0]]

# That worked correctly, we can apply it to our whole dataframe

In [14]:
def assign_ratings(rated=None, ratings=None):
    """Write the latest applicable rating into the 'Rating' column of the rated frame.

    For every company in ``rated['reduced_matches']`` the rating actions with the same
    ``reduced_name`` are applied from oldest to newest. Each action overwrites 'Rating' on
    all rows of that company whose 'Report Date' is on or after the action's 'Date', so a
    row ends up with the most recent rating issued up to its report date. Rows dated before
    the first action of their company are left untouched.

    Parameters
    ----------
    rated : pandas.DataFrame, optional
        Frame with 'reduced_matches' and 'Report Date' columns; it is modified in place.
        Defaults to the notebook-level ``df_rated``.
    ratings : pandas.DataFrame, optional
        Frame with 'reduced_name', 'Date' and 'Rating' columns; it is not modified.
        Defaults to the notebook-level ``ratings_df``.

    Returns
    -------
    None

    Notes
    -----
    Dates are compared exactly as stored in the frames.
    NOTE(review): the notebook reads them as text from CSV; 'YYYY-MM-DD' strings order
    chronologically, but confirm that no other date format occurs in either file.
    """
    # Fall back to the notebook-level frames so the original zero-argument call keeps working
    if rated is None:
        rated = df_rated
    if ratings is None:
        ratings = ratings_df

    companies = rated['reduced_matches'].dropna().unique()

    # Only the actions of companies present in `rated` matter. Exact duplicate actions would
    # just repeat the same assignment, so they are dropped (keeping the last occurrence so
    # that "last action wins" is preserved). The stable sort keeps the input order among
    # actions sharing a date, which makes the outcome deterministic.
    relevant = (
        ratings.loc[ratings['reduced_name'].isin(companies)]
        .drop_duplicates(subset=['reduced_name', 'Date', 'Rating'], keep='last')
        .sort_values(by='Date', ascending=True, kind='stable')
    )

    # groupby keeps the (sorted) row order inside each group
    for company, actions in relevant.groupby('reduced_name', sort=False):
        # The company mask does not depend on the action, so it is computed once per company
        company_rows = rated['reduced_matches'] == company
        for rating_date, rating in zip(actions['Date'], actions['Rating']):
            # Address the rows of this company whose report date is on or after the rating date
            rated.loc[company_rows & (rated['Report Date'] >= rating_date), 'Rating'] = rating

In [15]:
# We apply the ratings
assign_ratings()

In [16]:
# Checking results
df_rated.isna().sum()

Ticker                                                0
Company Name                                          0
IndustryId                                            0
Fiscal Year                                           0
Fiscal Period                                         0
Report Date                                         145
Shares (Diluted)                                    145
Cash, Cash Equivalents & Short Term Investments     145
Accounts & Notes Receivable                         145
Total Current Assets                                145
Property, Plant & Equipment, Net                    145
Other Long Term Assets                              145
Total Noncurrent Assets                             145
Total Assets                                        145
Payables & Accruals                                 145
Total Current Liabilities                           145
Total Noncurrent Liabilities                        145
Total Liabilities                               

In [17]:
# We got almost everything rated
df_rated[df_rated['Rating'].isna()]

Unnamed: 0,Ticker,Company Name,IndustryId,Fiscal Year,Fiscal Period,Report Date,Shares (Diluted),"Cash, Cash Equivalents & Short Term Investments",Accounts & Notes Receivable,Total Current Assets,...,Pretax Income (Loss),"Income Tax (Expense) Benefit, Net",Income (Loss) from Continuing Operations,Net Extraordinary Gains (Losses),Net Income,Net Income (Common),Sector,Industry,reduced_matches,Rating
33972,UNFI,UNITED NATURAL FOODS INC,102001.0,2017,Q2,2017-01-31,50755000.0,3.065800e+07,514870000.0,1.638398e+09,...,4.202800e+07,-1.654600e+07,2.548200e+07,0.0,2.548200e+07,2.548200e+07,Consumer Defensive,Retail - Defensive,united natural foods,
20063,M,"Macy's, Inc.",103002.0,2016,Q4,2017-01-31,307800000.0,1.297000e+09,522000000.0,7.626000e+09,...,7.280000e+08,-2.560000e+08,4.720000e+08,0.0,4.750000e+08,4.750000e+08,Consumer Cyclical,Retail - Apparel & Specialty,macys,
32343,TIVO,TIVO INC,101003.0,2016,Q4,2017-01-31,67381277.0,6.460690e+08,55654000.0,7.358850e+08,...,-3.277000e+06,3.476000e+06,1.990000e+05,0.0,1.990000e+05,1.990000e+05,Technology,Application Software,tivo,
21862,MRVL,MARVELL TECHNOLOGY GROUP LTD,101004.0,2016,Q4,2017-01-31,520623000.0,1.668360e+09,335384000.0,2.290434e+09,...,4.819000e+06,-6.834500e+07,-6.352600e+07,0.0,-8.009100e+07,-8.009100e+07,Technology,Semiconductors,marvell technology,
15683,HQY,HEALTHEQUITY INC,101003.0,2016,Q4,2017-01-31,60453000.0,1.803590e+08,17001000.0,2.008190e+08,...,6.023000e+06,-1.961000e+06,4.062000e+06,0.0,4.062000e+06,4.062000e+06,Technology,Application Software,healthequity,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37798,UNFI,UNITED NATURAL FOODS INC,102001.0,2022,Q2,,,,,,...,9.300000e+07,-2.500000e+07,6.800000e+07,0.0,6.600000e+07,6.600000e+07,Consumer Defensive,Retail - Defensive,united natural foods,
37811,VRNT,VERINT SYSTEMS INC,101003.0,2021,Q4,,,,,,...,6.115000e+06,-1.037500e+07,-4.260000e+06,0.0,-4.623000e+06,-9.823000e+06,Technology,Application Software,verint systems,
37812,VSCO,Victoria's Secret & Co.,103004.0,2021,Q4,,,,,,...,3.220000e+08,-7.600000e+07,2.460000e+08,0.0,2.460000e+08,2.460000e+08,Consumer Cyclical,Manufacturing - Apparel & Furniture,victorias secret,
37821,WMT,Walmart Inc,102001.0,2021,Q4,,,,,,...,4.782000e+09,-1.149000e+09,3.633000e+09,0.0,3.562000e+09,3.562000e+09,Consumer Defensive,Retail - Defensive,walmart,


I looked through a few of them and realized that the NaNs occur for the quarters in which no rating was available yet for the company, so we drop those rows.

In [18]:
df_rated.shape

(11185, 41)

In [19]:
# Note: without a `subset`, dropna removes every row that has a NaN in any column,
# not only the rows with a missing 'Rating'
df_rated = df_rated.dropna(axis='rows')

In [20]:
df_rated.shape

(8662, 41)

In [21]:
# Because dropna removes rows with a NaN in any column, all other NaNs vanished as well
df_rated.isna().sum()

Ticker                                             0
Company Name                                       0
IndustryId                                         0
Fiscal Year                                        0
Fiscal Period                                      0
Report Date                                        0
Shares (Diluted)                                   0
Cash, Cash Equivalents & Short Term Investments    0
Accounts & Notes Receivable                        0
Total Current Assets                               0
Property, Plant & Equipment, Net                   0
Other Long Term Assets                             0
Total Noncurrent Assets                            0
Total Assets                                       0
Payables & Accruals                                0
Total Current Liabilities                          0
Total Noncurrent Liabilities                       0
Total Liabilities                                  0
Share Capital & Additional Paid-In Capital    

## Saving our finalized dataframe

In [22]:
# Persist the rated frame; index=False keeps the row index out of the CSV
df_rated.to_csv('Prepared Frames/rated.csv', index=False)