<a href="https://colab.research.google.com/github/IAMDSVSSANGRAL/Datasets/blob/main/MyUSABANK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import pandas as pd
import numpy as np
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay

# Build a synthetic daily bank-metrics dataset (one row per US business day)
# with deliberately injected data-quality problems -- missing values, outliers
# and duplicate rows -- and export it to CSV.

# Fraction of the clean rows affected by each injected problem
MISSING_FRACTION = 0.1
DUPLICATE_FRACTION = 0.05
OUTLIER_FRACTION = 0.05

# Set the random seed for reproducibility
np.random.seed(42)

# Generate date range for 2 years on business days
start_date = '2022-01-01'
end_date = '2023-12-31'

# Create custom business day with US Federal holidays
us_calendar = CustomBusinessDay(calendar=USFederalHolidayCalendar())
dates = pd.date_range(start=start_date, end=end_date, freq=us_calendar)

# Number of business days
num_points = len(dates)

# Generate fake data with realistic ranges, no decimal values
# (np.random.randint excludes the upper bound)
data = {
    'Date': dates,
    'Interest_Income': np.random.randint(2000000, 3000000, num_points),
    'Interest_Expense': np.random.randint(500000, 1000000, num_points),
    'Average_Earning_Assets': np.random.randint(50000000, 60000000, num_points),
    'Net_Income': np.random.randint(1000000, 2000000, num_points),
    'Total_Assets': np.random.randint(80000000, 100000000, num_points),
    'Shareholder_Equity': np.random.randint(20000000, 30000000, num_points),
    'Operating_Expenses': np.random.randint(700000, 1500000, num_points),
    'Operating_Income': np.random.randint(3000000, 5000000, num_points),
    'Market_Share': np.random.randint(10, 30, num_points),  # Market Share in percentage
    'Stock_Price': np.random.randint(50, 200, num_points)  # Stock Price in dollars
}

# Create DataFrame
df = pd.DataFrame(data)

# Draw every row sample up front, always in the order missing -> duplicates ->
# outliers, so the seeded random stream is consumed in a fixed sequence that
# does not depend on the order in which the modifications are applied below.
num_missing = int(num_points * MISSING_FRACTION)
missing_indices = np.random.choice(num_points, num_missing, replace=False)
duplicate_indices = np.random.choice(
    num_points, int(num_points * DUPLICATE_FRACTION), replace=False
)
outliers_indices = np.random.choice(
    num_points, int(num_points * OUTLIER_FRACTION), replace=False
)

# Introduce some missing values.
# The columns are generated as integers, which cannot hold NaN; cast them to
# float explicitly instead of relying on pandas silently upcasting on
# assignment (deprecated behaviour that newer pandas versions warn about).
missing_cols = ['Interest_Income', 'Net_Income']
df = df.astype({col: 'float64' for col in missing_cols})
df.loc[missing_indices, missing_cols] = np.nan

# Introduce some outliers.
# This is done while the index is still unique, so each label-based .loc
# update touches exactly one row. NaN entries in Net_Income stay NaN.
df.loc[outliers_indices, 'Interest_Expense'] *= 3
df.loc[outliers_indices, 'Net_Income'] *= 2

# Introduce some duplicate values.
# The copies are taken after the outliers were applied, so every duplicate is
# an exact copy of its source row; ignore_index=True renumbers the rows so the
# resulting index has no repeated labels.
df_duplicates = df.iloc[duplicate_indices]
df = pd.concat([df, df_duplicates], ignore_index=True)

# Save to CSV for use in Tableau
df.to_csv('myusabank.csv', index=False)

# Display the first few rows of the DataFrame
print(df.head(20))

         Date  Interest_Income  Interest_Expense  Average_Earning_Assets  \
0  2022-01-03        2121958.0            773237                55001045   
1  2022-01-04        2671155.0            518070                54463418   
2  2022-01-05        2131932.0            797921                59771802   
3  2022-01-06        2365838.0            556958                54345760   
4  2022-01-07        2259178.0           1746222                57153768   
5  2022-01-10        2644167.0            510729                53251298   
6  2022-01-11        2110268.0            730785                54465150   
7  2022-01-12        2732180.0            676089                58389012   
8  2022-01-13        2054886.0            697392                57249903   
9  2022-01-14        2137337.0            658823                54897590   
10 2022-01-18        2999890.0            999616                56189573   
11 2022-01-19        2521430.0            605864                51323483   
12 2022-01-2

In [17]:
# Preview the first five rows of the frame using the notebook's rich display
df.head(n=5)

Unnamed: 0,Date,Interest_Income,Interest_Expense,Average_Earning_Assets,Net_Income,Total_Assets,Shareholder_Equity,Operating_Expenses,Operating_Income,Market_Share,Stock_Price
0,2022-01-03,2121958.0,773237,55001045,1572212.0,95404302,23496605,811672,3494178,24,128
1,2022-01-04,2671155.0,518070,54463418,1336326.0,86440010,24948600,1030817,3231496,20,63
2,2022-01-05,2131932.0,797921,59771802,1224137.0,88674163,26416438,1345961,3692148,22,196
3,2022-01-06,2365838.0,556958,54345760,1452268.0,97221407,29694095,1289921,4779685,14,177
4,2022-01-07,2259178.0,1746222,57153768,3858336.0,98279553,25311499,1432303,4764985,10,103


In [18]:
# (rows, columns) of the frame; the row count includes the appended duplicate rows
df.shape

(523, 11)