# We match our rating data to the main dataset

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import date

## Loading data

In [2]:
# Financial statement data (one row per company and reporting period)
main_df = pd.read_csv('Prepared Frames/main_data.csv')
# Approximate company-name matches; supplies the `Ticker` / `difflib` pairs merged in below
df_companies = pd.read_csv('Prepared Frames/companies_approx_match.csv')
# Rating actions (company, date, rating)
ratings_df = pd.read_csv('Prepared Frames/rating_data.csv')

## Merging the found matches to the main Dataframe

In [3]:
# Attach the matched rating-company name (`difflib`) to every row of the main data.
# A left join keeps all rows of `main_df`; tickers without a match get NaN in `difflib`
# and are filtered out in the next step.
main_df = main_df.merge(
    df_companies[['Ticker', 'difflib']],
    how='left',
    on='Ticker',
)

## Assigning the ratings
We now have to assign the ratings. The difficulty is that the rating dates and the report dates do not line up exactly: a rating applies to every report dated on or after the rating action, and it stays in force until a newer rating for the same company replaces it.

In [5]:
# Ratings can only be assigned to rows whose company was matched,
# so we continue with an independent copy restricted to those rows
has_match = main_df['difflib'].notna()
df_rated = main_df.loc[has_match].copy()

In [15]:
# Order the rows chronologically by report date (oldest first) and preview the result
df_rated = df_rated.sort_values('Report Date')
df_rated.head(3)

Unnamed: 0,Ticker,Company Name,IndustryId,Fiscal Year,Fiscal Period,Report Date,Shares (Diluted),"Cash, Cash Equivalents & Short Term Investments",Accounts & Notes Receivable,Total Current Assets,...,Change in Fixed Assets & Intangibles,Net Cash from Acquisitions & Divestitures,Net Cash from Investing Activities,Cash from (Repayment of) Debt,Cash from (Repurchase of) Equity,Net Cash from Financing Activities,Net Change in Cash,Sector,Industry,difflib
18103,KEYS,"Keysight Technologies, Inc.",101001.0,2017,Q1,2017-01-31,173000000.0,896000000.0,395000000.0,1932000000.0,...,-8000000.0,-62800000.0,-8000000.0,53692310.0,19000000.0,21000000.0,113000000,Technology,Computer Hardware,"keysight technologies, inc."
14236,GME,GameStop Corp.,103002.0,2016,Q4,2017-01-31,410400000.0,669400000.0,220900000.0,2140700000.0,...,-36900000.0,-100000.0,-35900000.0,-200000.0,-19800000.0,-58100000.0,311600000,Consumer Cyclical,Retail - Apparel & Specialty,gamestop corp.
28690,SAIC,Science Applications International Corp,101003.0,2016,Q4,2017-01-31,47000000.0,210000000.0,427000000.0,789000000.0,...,-4000000.0,0.0,-4000000.0,0.0,-41000000.0,-53000000.0,5000000,Technology,Application Software,science applications international corp


In [18]:
# The company names were matched on their lower-case version, so we recreate that column here.
# The vectorised string accessor leaves a missing company name as NaN instead of raising
# AttributeError the way a row-wise `x.lower()` would.
ratings_df['Company_lower'] = ratings_df['Company'].str.lower()

In [10]:
# Unique matched company names found in `df_rated` (the matched subset of the main data)
companies = df_rated['difflib'].unique().tolist()

#### Checking if we can match right

In [20]:
# First two reports of the first matched company
df_rated.loc[df_rated['difflib'] == companies[0]].head(2)

Unnamed: 0,Ticker,Company Name,IndustryId,Fiscal Year,Fiscal Period,Report Date,Shares (Diluted),"Cash, Cash Equivalents & Short Term Investments",Accounts & Notes Receivable,Total Current Assets,...,Change in Fixed Assets & Intangibles,Net Cash from Acquisitions & Divestitures,Net Cash from Investing Activities,Cash from (Repayment of) Debt,Cash from (Repurchase of) Equity,Net Cash from Financing Activities,Net Change in Cash,Sector,Industry,difflib
18103,KEYS,"Keysight Technologies, Inc.",101001.0,2017,Q1,2017-01-31,173000000.0,896000000.0,395000000.0,1932000000.0,...,-8000000.0,-62800000.0,-8000000.0,53692310.0,19000000.0,21000000.0,113000000,Technology,Computer Hardware,"keysight technologies, inc."
18104,KEYS,"Keysight Technologies, Inc.",101001.0,2017,Q2,2017-04-30,179000000.0,1023000000.0,518000000.0,2314000000.0,...,-17000000.0,-62800000.0,-1634000000.0,1239000000.0,446000000.0,1656000000.0,87000000,Technology,Computer Hardware,"keysight technologies, inc."


In [19]:
# All rating actions recorded for that same company
ratings_df.loc[ratings_df['Company_lower'] == companies[0]]

Unnamed: 0,Company,Date,Rating,Company_lower
67445,"Keysight Technologies, Inc.",2014-09-12,Baa3,"keysight technologies, inc."
67446,"Keysight Technologies, Inc.",2019-08-14,Baa2,"keysight technologies, inc."


In [21]:
# Column names available in the ratings frame
ratings_df.columns

Index(['Company', 'Date', 'Rating', 'Company_lower'], dtype='object')

#### We have to make sure each rating only overwrites reports dated on or after it
We test this manually for the first company and then wrap the same logic in a function for all the others.

In [24]:
# Rating history of the first company only, oldest action first, with a fresh 0..n-1 index
first_company_mask = ratings_df['Company_lower'] == companies[0]
current_ratings = (
    ratings_df.loc[first_company_mask]
    .sort_values('Date')
    .reset_index(drop=True)
)
current_ratings

Unnamed: 0,Company,Date,Rating,Company_lower
0,"Keysight Technologies, Inc.",2014-09-12,Baa3,"keysight technologies, inc."
1,"Keysight Technologies, Inc.",2019-08-14,Baa2,"keysight technologies, inc."


In [27]:
# Apply the rating actions oldest first: each action sets the rating on every report of
# the company dated on or after it, so a later action overwrites an earlier one.
# The loop variable is called `rating_date` rather than `date`, because a module-level
# `date` would replace `datetime.date` imported in the first cell.
for company, rating_date, rating in zip(current_ratings['Company_lower'],
                                        current_ratings['Date'],
                                        current_ratings['Rating']):
    # Rows of this company reported on or after the rating action
    affected = (df_rated['difflib'] == company) & (df_rated['Report Date'] >= rating_date)
    df_rated.loc[affected, 'Rating'] = rating

In [31]:
# Checking the result: the rating assigned to each report date of the first company
df_rated.loc[df_rated['difflib'] == companies[0], ['Report Date', 'Rating']]

That worked correctly, so we can apply it to our whole dataframe.

In [49]:
def assign_ratings(rated=None, ratings=None):
    """Write the rating in force at each report date into ``rated['Rating']``.

    For every company in ``rated['difflib']`` the rating actions with the same
    ``Company_lower`` are applied oldest first. Each action sets ``Rating`` on all
    rows of that company whose ``Report Date`` is on or after the action's ``Date``,
    so a later action overwrites an earlier one. Rows reported before the first
    action of their company, and rows of companies without any action, keep NaN.

    Parameters
    ----------
    rated : pandas.DataFrame, optional
        Frame with ``difflib`` and ``Report Date`` columns. It is modified in place;
        a ``Rating`` column is added if it is missing. Defaults to the notebook-level
        ``df_rated``.
    ratings : pandas.DataFrame, optional
        Frame with ``Company_lower``, ``Date`` and ``Rating`` columns. Defaults to the
        notebook-level ``ratings_df``.

    Returns
    -------
    None

    Notes
    -----
    ``Report Date`` and ``Date`` are compared with ``>=`` as they are, so both columns
    must hold mutually comparable values (e.g. ISO ``YYYY-MM-DD`` strings or datetimes).
    """
    # Fall back to the notebook-level frames so the original no-argument call keeps working
    if rated is None:
        rated = df_rated
    if ratings is None:
        ratings = ratings_df

    # Make sure the target column exists even when not a single action applies.
    # Object dtype, because the ratings written into it are strings.
    if 'Rating' not in rated.columns:
        rated['Rating'] = np.full(len(rated), np.nan, dtype=object)

    # Split the rating actions by company once instead of rescanning the whole ratings
    # frame per company. The stable sort keeps same-day actions in their original order.
    chronological = ratings.sort_values(by='Date', kind='stable')
    actions_by_company = {
        name: actions for name, actions in chronological.groupby('Company_lower', sort=False)
    }

    for company in rated['difflib'].unique():
        actions = actions_by_company.get(company)
        if actions is None:
            # No rating action for this company: its rows stay unrated
            continue
        # The company mask does not change between actions, so it is computed once
        is_company = rated['difflib'] == company
        for rating_date, rating in zip(actions['Date'], actions['Rating']):
            # Only reports dated on or after the action receive (or are overwritten with) its rating
            rated.loc[is_company & (rated['Report Date'] >= rating_date), 'Rating'] = rating

In [54]:
# We apply the ratings: this writes the `Rating` column of `df_rated` in place
assign_ratings()

In [55]:
# Checking results: number of missing values per column
df_rated.isna().sum()

Ticker                                                0
Company Name                                          0
IndustryId                                            0
Fiscal Year                                           0
Fiscal Period                                         0
Report Date                                           0
Shares (Diluted)                                      0
Cash, Cash Equivalents & Short Term Investments       0
Accounts & Notes Receivable                           0
Total Current Assets                                  0
Property, Plant & Equipment, Net                      0
Other Long Term Assets                                0
Total Noncurrent Assets                               0
Total Assets                                          0
Payables & Accruals                                   0
Total Current Liabilities                             0
Total Noncurrent Liabilities                          0
Total Liabilities                               

In [56]:
# Rows that are still without a rating after the assignment
df_rated.loc[df_rated['Rating'].isna()]

Unnamed: 0,Ticker,Company Name,IndustryId,Fiscal Year,Fiscal Period,Report Date,Shares (Diluted),"Cash, Cash Equivalents & Short Term Investments",Accounts & Notes Receivable,Total Current Assets,...,Net Cash from Acquisitions & Divestitures,Net Cash from Investing Activities,Cash from (Repayment of) Debt,Cash from (Repurchase of) Equity,Net Cash from Financing Activities,Net Change in Cash,Sector,Industry,difflib,Rating
15683,HQY,HEALTHEQUITY INC,101003.0,2016,Q4,2017-01-31,60453000.0,1.803590e+08,17001000.0,2.008190e+08,...,0.000000e+00,-3275000.0,0.0,2596000.0,3321000.0,14608000,Technology,Application Software,"healthequity, inc.",
20063,M,"Macy's, Inc.",103002.0,2016,Q4,2017-01-31,307800000.0,1.297000e+09,522000000.0,7.626000e+09,...,0.000000e+00,304000000.0,-628000000.0,-81000000.0,-957000000.0,840000000,Consumer Cyclical,Retail - Apparel & Specialty,"macy's, inc.",
33972,UNFI,UNITED NATURAL FOODS INC,102001.0,2017,Q2,2017-01-31,50755000.0,3.065800e+07,514870000.0,1.638398e+09,...,9.200000e+04,-15366000.0,-71933000.0,165000.0,-71791000.0,17102000,Consumer Defensive,Retail - Defensive,"united natural foods, inc",
7091,CLDR,"Cloudera, Inc.",101003.0,2016,Q4,2017-01-31,36848463.0,2.349560e+08,101549000.0,3.497020e+08,...,0.000000e+00,75626000.0,0.0,-1015000.0,-1015000.0,42738000,Technology,Application Software,"cloudera, inc",
18252,KMG,KMG CHEMICALS INC,110001.0,2017,Q2,2017-01-31,12293000.0,3.058700e+07,35309000.0,1.124910e+08,...,-1.280460e+08,-2676000.0,7700000.0,43909250.0,7124000.0,12981000,Basic Materials,Chemicals,"kmg chemicals, inc.",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5441,BXC,BlueLinx Holdings Inc.,110005.0,2021,Q2,2021-06-30,9858000.0,1.790000e+05,437217000.0,8.993900e+08,...,-4.886443e+07,-1488000.0,-38103000.0,0.0,-45721000.0,0,Basic Materials,Building Materials,bluelinx holdings inc.,
15701,HQY,HEALTHEQUITY INC,101003.0,2021,Q2,2021-07-31,83509000.0,7.537540e+08,74223000.0,8.606140e+08,...,-2.344000e+06,-20467000.0,0.0,2483000.0,200000.0,16981000,Technology,Application Software,"healthequity, inc.",
5442,BXC,BlueLinx Holdings Inc.,110005.0,2021,Q3,2021-09-30,9714000.0,1.860000e+05,344974000.0,8.204260e+08,...,-4.886443e+07,-1972000.0,-97300000.0,0.0,-102279000.0,7000,Basic Materials,Building Materials,bluelinx holdings inc.,
26520,PRO,"PROS Holdings, Inc.",101003.0,2021,Q3,2021-09-30,44318000.0,3.086420e+08,43125000.0,3.681300e+08,...,-2.482440e+07,-2516000.0,-288000.0,1515000.0,1227000.0,-9684000,Technology,Application Software,"prog holdings, inc.",


In [59]:
# Rating actions on record for one of the unrated companies
ratings_df.loc[ratings_df['Company_lower'] == "prog holdings, inc."]

Unnamed: 0,Company,Date,Rating,Company_lower
80887,"PROG HOLDINGS, INC.",2021-11-08,B1,"prog holdings, inc."


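I looked through a few of them and realized that the NaNs occur for the quarters in which no rating was available yet for the company, so we drop those rows. Note that the last row shown above pairs PROS Holdings, Inc. (ticker PRO) with the rating entry for PROG HOLDINGS, INC.; the names differ, so this looks like a false approximate match, and such pairs should be checked before the ratings are used.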

In [64]:
# (rows, columns) before the rows with missing values are dropped
df_rated.shape

(7086, 37)

In [65]:
# Drop only the rows that have no rating. Without `subset`, a row with a NaN in any
# other column would be removed as well, which is not what is intended here.
df_rated = df_rated.dropna(axis='rows', subset=['Rating'])

In [66]:
# (rows, columns) after dropping; compare with the shape shown before the drop
df_rated.shape

(6079, 37)

## Saving our finalized dataframe

In [67]:
# Write the rated frame to disk; index=False leaves the row index out of the file
df_rated.to_csv('Prepared Frames/rated.csv', index=False)