# We match our rating data to the main dataset

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import date

## Loading data

In [2]:
# Read the three prepared CSV inputs in one go: the rating data, the
# approximate company-name matches, and the main dataset.
ratings_df, df_companies, main_df = map(
    pd.read_csv,
    (
        'Prepared Frames/rating_data.csv',
        'Prepared Frames/companies_approx_match.csv',
        'Prepared Frames/main_data.csv',
    ),
)

## Merging the found matches to the main Dataframe

In [3]:
# Attach the matched company name ('difflib') to every row of the main data.
# A left join keeps all rows of main_df; tickers without a match get NaN in
# 'difflib', which has to be handled afterwards.
main_df = pd.merge(
    left=main_df,
    right=df_companies[['Ticker', 'difflib']],
    how='left',
    on='Ticker',
)

## Assigning the ratings
We now have to assign the ratings. The difficulty is that the report dates and the rating dates do not match exactly: a rating applies to every report in the main dataframe dated on or after the rating date, and it has to be overwritten as soon as a newer rating exists.

In [4]:
# Ratings can only be assigned to rows whose company was matched, i.e. rows
# with a non-missing 'difflib'. Work on an independent copy of those rows.
df_rated = main_df[main_df['difflib'].notna()].copy()

In [5]:
# Order the rows by ascending 'Report Date' and preview the first three
df_rated = df_rated.sort_values('Report Date')
df_rated.head(3)

Unnamed: 0,Ticker,Company Name,IndustryId,Fiscal Year,Fiscal Period,Report Date,Shares (Diluted),"Cash, Cash Equivalents & Short Term Investments",Accounts & Notes Receivable,Total Current Assets,...,Abnormal Gains (Losses),Pretax Income (Loss),"Income Tax (Expense) Benefit, Net",Income (Loss) from Continuing Operations,Net Extraordinary Gains (Losses),Net Income,Net Income (Common),Sector,Industry,difflib
18103,KEYS,"Keysight Technologies, Inc.",101001.0,2017,Q1,2017-01-31,173000000.0,896000000.0,395000000.0,1932000000.0,...,0.0,152000000.0,-43000000.0,109000000.0,0.0,109000000.0,109000000.0,Technology,Computer Hardware,"keysight technologies, inc."
17887,JWN,NORDSTROM INC,103002.0,2016,Q4,2017-01-31,178500000.0,1007000000.0,199000000.0,3242000000.0,...,0.0,393000000.0,-192000000.0,201000000.0,0.0,201000000.0,201000000.0,Consumer Cyclical,Retail - Apparel & Specialty,"nordstrom, inc."
5334,BURL,"Burlington Stores, Inc.",102001.0,2016,Q4,2017-01-31,70877000.0,81597000.0,43252000.0,928324000.0,...,0.0,190532000.0,-64971000.0,125561000.0,0.0,125561000.0,125561000.0,Consumer Defensive,Retail - Defensive,"burlington stores, inc."


In [6]:
# The company names were matched on their lowercase version, so recreate that
# lowercase column here. The vectorised .str accessor leaves a missing name as
# NaN instead of raising on it.
ratings_df['Company_lower'] = ratings_df['Company'].str.lower()

In [7]:
# Unique matched company names present in df_rated, in order of first appearance
companies = df_rated['difflib'].drop_duplicates().tolist()

#### Checking if we can match right

In [8]:
# First two rows of the main data for the first matched company
df_rated.loc[df_rated['difflib'].eq(companies[0])].head(2)

Unnamed: 0,Ticker,Company Name,IndustryId,Fiscal Year,Fiscal Period,Report Date,Shares (Diluted),"Cash, Cash Equivalents & Short Term Investments",Accounts & Notes Receivable,Total Current Assets,...,Abnormal Gains (Losses),Pretax Income (Loss),"Income Tax (Expense) Benefit, Net",Income (Loss) from Continuing Operations,Net Extraordinary Gains (Losses),Net Income,Net Income (Common),Sector,Industry,difflib
18103,KEYS,"Keysight Technologies, Inc.",101001.0,2017,Q1,2017-01-31,173000000.0,896000000.0,395000000.0,1932000000.0,...,0.0,152000000.0,-43000000.0,109000000.0,0.0,109000000.0,109000000.0,Technology,Computer Hardware,"keysight technologies, inc."
18104,KEYS,"Keysight Technologies, Inc.",101001.0,2017,Q2,2017-04-30,179000000.0,1023000000.0,518000000.0,2314000000.0,...,0.0,22000000.0,27000000.0,49000000.0,0.0,49000000.0,49000000.0,Technology,Computer Hardware,"keysight technologies, inc."


In [9]:
# All rating entries recorded for that same company
ratings_df.loc[ratings_df['Company_lower'].eq(companies[0])]

Unnamed: 0,Company,Date,Rating,Company_lower
49310,"Keysight Technologies, Inc.",2014-09-12,Baa3,"keysight technologies, inc."
49311,"Keysight Technologies, Inc.",2019-08-14,Baa2,"keysight technologies, inc."


In [10]:
# Show the column names of the ratings dataframe
ratings_df.columns

Index(['Company', 'Date', 'Rating', 'Company_lower'], dtype='object')

#### We have to make sure that an updated rating only overwrites reports dated on or after the rating date
We will test it manually for the first company and do the rest with a function

In [11]:
# Sub-dataframe with only the first company's ratings, sorted by ascending
# 'Date' and re-indexed from 0
current_ratings = (
    ratings_df
    .loc[ratings_df['Company_lower'].eq(companies[0])]
    .sort_values('Date')
    .reset_index(drop=True)
)
current_ratings

Unnamed: 0,Company,Date,Rating,Company_lower
0,"Keysight Technologies, Inc.",2014-09-12,Baa3,"keysight technologies, inc."
1,"Keysight Technologies, Inc.",2019-08-14,Baa2,"keysight technologies, inc."


In [12]:
# Apply the rating actions in current_ratings row by row (sorted by ascending
# 'Date'): each action stamps its rating on every report of the company dated
# on or after it, so a later action overwrites an earlier one.
# The loop variable is called `rating_date` so that the `date` class imported
# from datetime is not overwritten in the notebook namespace.
for company, rating_date, rating in zip(current_ratings['Company_lower'],
                                        current_ratings['Date'],
                                        current_ratings['Rating']):
    # Rows of the same company whose report date is on or after the rating date
    on_or_after = (df_rated['difflib'] == company) & (df_rated['Report Date'] >= rating_date)
    df_rated.loc[on_or_after, 'Rating'] = rating

In [13]:
# Checking the result
# df_rated[df_rated['difflib']==companies[0]]

# That worked correctly, we can apply it to our whole dataframe

In [14]:
def assign_ratings(rated=None, ratings=None):
    """Write each company's rating into the 'Rating' column of `rated`.

    For every company name in ``rated['difflib']`` the matching rows of
    `ratings` (matched on 'Company_lower') are applied in ascending 'Date'
    order. Each rating is written to all rows of that company whose
    'Report Date' is greater than or equal to the rating date, so a later
    rating overwrites an earlier one. Rows dated before the company's first
    rating, and rows of companies without any rating, keep a missing 'Rating'.

    Parameters
    ----------
    rated : pandas.DataFrame, optional
        Frame with 'difflib' and 'Report Date' columns; modified in place.
        Defaults to the notebook-level ``df_rated``.
    ratings : pandas.DataFrame, optional
        Frame with 'Company_lower', 'Date' and 'Rating' columns.
        Defaults to the notebook-level ``ratings_df``.

    Returns
    -------
    None
        The result is stored in ``rated['Rating']``.
    """
    # Fall back to the notebook globals so the bare call assign_ratings() still works
    if rated is None:
        rated = df_rated
    if ratings is None:
        ratings = ratings_df

    # Make sure the column exists even when no company has a single rating
    if 'Rating' not in rated.columns:
        rated['Rating'] = pd.Series(np.nan, index=rated.index, dtype=object)

    # Keep only ratings of companies that occur in `rated`, then split them
    # once per company instead of rescanning the whole ratings frame each time
    wanted = set(rated['difflib'].unique())
    relevant = ratings[ratings['Company_lower'].isin(wanted)]

    for company, history in relevant.groupby('Company_lower', sort=False):
        # The company mask does not change between rating actions
        is_company = rated['difflib'] == company
        history = history.sort_values(by=['Date'], ascending=True)
        for rating_date, rating in zip(history['Date'], history['Rating']):
            # Address the rows of this company dated on or after the rating
            rated.loc[is_company & (rated['Report Date'] >= rating_date), 'Rating'] = rating

In [15]:
# We apply the ratings to every company in df_rated (df_rated is modified in place)
assign_ratings()

In [16]:
# Count the missing values in each column after the assignment
df_rated.isna().sum(axis=0)

Ticker                                                0
Company Name                                          0
IndustryId                                            0
Fiscal Year                                           0
Fiscal Period                                         0
Report Date                                          95
Shares (Diluted)                                     95
Cash, Cash Equivalents & Short Term Investments      95
Accounts & Notes Receivable                          95
Total Current Assets                                 95
Property, Plant & Equipment, Net                     95
Other Long Term Assets                               95
Total Noncurrent Assets                              95
Total Assets                                         95
Payables & Accruals                                  95
Total Current Liabilities                            95
Total Noncurrent Liabilities                         95
Total Liabilities                               

In [17]:
# Rows that still have no rating after the assignment
df_rated.loc[df_rated['Rating'].isna()]

Unnamed: 0,Ticker,Company Name,IndustryId,Fiscal Year,Fiscal Period,Report Date,Shares (Diluted),"Cash, Cash Equivalents & Short Term Investments",Accounts & Notes Receivable,Total Current Assets,...,Pretax Income (Loss),"Income Tax (Expense) Benefit, Net",Income (Loss) from Continuing Operations,Net Extraordinary Gains (Losses),Net Income,Net Income (Common),Sector,Industry,difflib,Rating
18252,KMG,KMG CHEMICALS INC,110001.0,2017,Q2,2017-01-31,12293000.0,3.058700e+07,35309000.0,1.124910e+08,...,8.582000e+06,-2.097000e+06,6.485000e+06,0.0,6.485000e+06,6.485000e+06,Basic Materials,Chemicals,"kmg chemicals, inc.",
7091,CLDR,"Cloudera, Inc.",101003.0,2016,Q4,2017-01-31,36848463.0,2.349560e+08,101549000.0,3.497020e+08,...,-3.173900e+07,-7.610000e+05,-3.250000e+07,0.0,-3.250000e+07,-3.250000e+07,Technology,Application Software,"cloudera, inc",
21862,MRVL,MARVELL TECHNOLOGY GROUP LTD,101004.0,2016,Q4,2017-01-31,520623000.0,1.668360e+09,335384000.0,2.290434e+09,...,4.819000e+06,-6.834500e+07,-6.352600e+07,0.0,-8.009100e+07,-8.009100e+07,Technology,Semiconductors,marvell technology group ltd.,
33972,UNFI,UNITED NATURAL FOODS INC,102001.0,2017,Q2,2017-01-31,50755000.0,3.065800e+07,514870000.0,1.638398e+09,...,4.202800e+07,-1.654600e+07,2.548200e+07,0.0,2.548200e+07,2.548200e+07,Consumer Defensive,Retail - Defensive,"united natural foods, inc",
20063,M,"Macy's, Inc.",103002.0,2016,Q4,2017-01-31,307800000.0,1.297000e+09,522000000.0,7.626000e+09,...,7.280000e+08,-2.560000e+08,4.720000e+08,0.0,4.750000e+08,4.750000e+08,Consumer Cyclical,Retail - Apparel & Specialty,"macy's, inc.",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37798,UNFI,UNITED NATURAL FOODS INC,102001.0,2022,Q2,,,,,,...,9.300000e+07,-2.500000e+07,6.800000e+07,0.0,6.600000e+07,6.600000e+07,Consumer Defensive,Retail - Defensive,"united natural foods, inc",
37811,VRNT,VERINT SYSTEMS INC,101003.0,2021,Q4,,,,,,...,6.115000e+06,-1.037500e+07,-4.260000e+06,0.0,-4.623000e+06,-9.823000e+06,Technology,Application Software,verint systems inc.,
37812,VSCO,Victoria's Secret & Co.,103004.0,2021,Q4,,,,,,...,3.220000e+08,-7.600000e+07,2.460000e+08,0.0,2.460000e+08,2.460000e+08,Consumer Cyclical,Manufacturing - Apparel & Furniture,victoria's secret & co.,
37821,WMT,Walmart Inc,102001.0,2021,Q4,,,,,,...,4.782000e+09,-1.149000e+09,3.633000e+09,0.0,3.562000e+09,3.562000e+09,Consumer Defensive,Retail - Defensive,walmart inc.,


In [18]:
# Look up the rating entries of one of the companies with unrated rows
ratings_df.loc[ratings_df['Company_lower'].eq("prog holdings, inc.")]

Unnamed: 0,Company,Date,Rating,Company_lower
59715,"PROG HOLDINGS, INC.",2021-11-08,B1,"prog holdings, inc."


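I looked through a few of them and realized that the NaNs occur for quarters in which no rating was available yet for the company (for example, the only rating for PROG Holdings in the data is dated 2021-11-08). Rows with a missing report date also stay unrated. We therefore drop those rows.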

In [19]:
# Number of rows and columns before dropping rows with missing values
df_rated.shape

(7181, 41)

In [20]:
# Drop incomplete rows. Note: no `subset` is given, so this removes every row
# that has a NaN in ANY column, not only the rows whose 'Rating' is missing.
df_rated = df_rated.dropna(axis=0, how='any')

In [21]:
# Number of rows and columns after dropping rows with missing values
df_rated.shape

(5415, 41)

In [23]:
# dropna() removed every row containing a NaN, so no column has missing values left
df_rated.isna().sum(axis=0)

Ticker                                             0
Company Name                                       0
IndustryId                                         0
Fiscal Year                                        0
Fiscal Period                                      0
Report Date                                        0
Shares (Diluted)                                   0
Cash, Cash Equivalents & Short Term Investments    0
Accounts & Notes Receivable                        0
Total Current Assets                               0
Property, Plant & Equipment, Net                   0
Other Long Term Assets                             0
Total Noncurrent Assets                            0
Total Assets                                       0
Payables & Accruals                                0
Total Current Liabilities                          0
Total Noncurrent Liabilities                       0
Total Liabilities                                  0
Share Capital & Additional Paid-In Capital    

## Saving our finalized dataframe

In [22]:
# Write the rated rows to disk without the dataframe index
df_rated.to_csv(path_or_buf='Prepared Frames/rated.csv', index=False)