# Merging the rating data with my dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import date

In [2]:
# For indexing the different files
from os import listdir
from os.path import isfile, join
import csv
import pprint
import pathlib
import collections

In [3]:
import time
from datetime import date

# Loading the credit rating actions:

#### Retrieving all the filepaths

In [4]:
# Path of the file directory
rating_directory = pathlib.Path.cwd().joinpath('Prepared Data')

In [5]:
# Gewtting list of all files
files_list = []
files_list = listdir(rating_directory)

In [6]:
rating_frames = []

In [7]:
# Getting all files as dataframes into a list
for file in files_list:
    filepath = open(rating_directory.joinpath(file))
    df = pd.read_csv(filepath)
    rating_frames.append(df)

In [8]:
# Merging all files
ratings_df = pd.concat(rating_frames, axis=0)

In [9]:
display(ratings_df.tail(20))
ratings_df.shape

Unnamed: 0,Company,Date,Rating
7,Texarkana College? TX,2021-08-05,WR
8,Missoula County Public School District 1? MT,2021-04-06,Aa3
9,Niles Township District for Special Education ...,2021-02-02,Aa3
10,Idaho Public Charter School Facilities Program,2021-03-10,Aa3
11,Mineral Area Community College District? MO (R...,2021-08-05,A2
12,Orangeburg County School District? SC,2021-03-19,A2
13,Bethlehem Area Vocational-Technical School? PA,2021-03-26,A1
14,York County School of Technology? PA,2021-03-18,A2
15,Hamilton? Fulton and Montgomery Counties Board...,2021-03-26,A1
16,North West Fire District? NY,2021-04-12,A1


(81592, 3)

In [10]:
# Changing back the question marks to commas
def convert(x):
    return x.replace('?',',')
ratings_df['Company'] = ratings_df['Company'].apply(convert)

In [11]:
# Converting the Date to datetime format
ratings_df['Date'] = pd.to_datetime(ratings_df['Date'])

## Data cleaning
Before we will take a look at the matches, we will clean up the other two columns of our dataframe.

#### Date column

In [18]:
# The good thing is, if there is anything but a date in the column, 
# the datetime function will throw an error.
ratings_df['Date'] = pd.to_datetime(ratings_df['Date'])

# Looks like the conversation here was successfull

#### Rating column
We have a look at the unique values in the rating column, those should ideally not be too many

In [24]:
len(ratings_df['Rating'].unique())
# I commented this out for not to bloat the notebook
# ratings_df['Rating'].unique()

67

The bad news: There are way too many unique values.
The good news: They all look like ratings, sometimes there are some values attached that do not belong there. We will try to get rid of them.

Most of the malformed rating follow the pattern, that they have a dot attached and then some characters.
We can help ourselfs here with a simple split function.

In [26]:
def remove_attach(rating):
    try:
        # We only take what is before the dot.
        res = rating.split('.')[0]
    except:
        # If there is no dot we leave the string as is.
        res = rating
    # Additionally we often see "(P)"  or "-PD" attached to our rating, we will get rid of that too.
    res = res.replace('(P)','')
    res = res.replace('-PD','')
    return res        

In [27]:
# We apply the cleaning function on our dataframe
ratings_df['Rating'] = ratings_df['Rating'].apply(remove_attach)
len(ratings_df['Rating'].unique())

55

In [29]:
# The rating that are relevant for us, since they are company credit ratings are now clean.
# We are talking about those, who Start with the letters A, B or C
ratings_df['Rating'].unique()

array(['Ba1', 'WR', 'A2', 'A3', 'Baa2', 'Ba3', 'Ba2', 'B1', 'Aa3', 'Baa1',
       'Baa3', 'NP', 'P-3', 'P-2', 'P-1', 'A1', 'Caa2', 'B2', 'Aa1',
       'Aa2', 'Aaa', 'B3', 'Caa1', 'Ca', 'Caa3', 'D', 'TR-1', 'C', 'MX-1',
       'BR-1', 'BR-2', 'BR-4', 'SA-2', 'SA-1', 'BR-3', 'LB-1', 'LB-2',
       'LB-3', 'LB-4', 'MA-1', 'BO-1', 'MX-2', 'MX-3', 'MX-4', 'BO-3',
       'BO-2', 'TR-2', 'TR-3', 'TR-4', 'CZ-4', 'NG-1', 'NG-2', 'NG-4',
       'KE-1', 'MIG 1'], dtype=object)

## Saving the cleaned Ratings Data

In [30]:
ratings_df.to_csv('Prepared Frames/rating_data.csv', index=False)