In [1]:
import csv
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px
from datetime import datetime, timedelta
import pandas as pd
import numpy as np

DATACSV = "./data/Data.csv"

In [2]:

reader = pd.read_csv(DATACSV, chunksize=10 ** 6)

date_range = pd.date_range(start='2016-01-01', end='2019-01-01', freq='15D')

date = []
counts = np.zeros((len(date_range), 3))

for i, end_date in enumerate(date_range):
    date.append(end_date.to_pydatetime())

for chunk in reader:
    chunk['api_applications_created'] = pd.to_numeric(chunk['api_applications_created'], errors='coerce', downcast='integer')
    chunk['FemaleFlag'] = pd.to_numeric(chunk['FemaleFlag'], errors='coerce', downcast='integer')
    
    chunk["api_applications_created_date"] = pd.to_datetime(chunk["api_applications_created"], unit='s')

    for i, end_date in enumerate(date_range):
        start_date = end_date - pd.DateOffset(days=30*2) 

        filtered_chunk_total = chunk[(chunk['api_applications_created_date'] >= start_date) & (chunk['api_applications_created_date'] <= end_date)]
        filtered_chunk_women = chunk[(chunk['api_applications_created_date'] >= start_date) & (chunk['api_applications_created_date'] <= end_date) & (chunk['FemaleFlag'] == 1)]
        filtered_chunk_men = chunk[(chunk['api_applications_created_date'] >= start_date) & (chunk['api_applications_created_date'] <= end_date) & (chunk['FemaleFlag'] != 1)]

        application_count = len(filtered_chunk_total)
        women_count = len(filtered_chunk_women)
        men_count = len(filtered_chunk_men)

        counts[i][0] += application_count
        counts[i][1] += women_count
        counts[i][2] += men_count

ac_total, ac_women, ac_men = counts.T 


  for chunk in reader:


KeyboardInterrupt: 

In [20]:
correlation_matrix = np.corrcoef(counts, rowvar=False)

fig = go.Figure()
fig.add_trace(go.Scatter(x=date, y=ac_total, mode='lines+markers', name='Total'))
fig.add_trace(go.Scatter(x=date, y=ac_women, mode='lines+markers', name='Women'))
fig.add_trace(go.Scatter(x=date, y=ac_men, mode='lines+markers', name='Men'))
fig.update_layout(title='Applications in The Past Month', xaxis_title='Date', yaxis_title='Application Count')
fig.write_html("app_over_time.html")

fig = ff.create_annotated_heatmap(
    z=np.flipud(correlation_matrix),
    x=['Total', 'Women', 'Men'],
    y=['Men', 'Women', 'Total'],
    colorscale='Viridis'
)

# Update layout
fig.update_layout(
    title='Correlation of Female, Male, and Total Applications over time' ,
    xaxis_title='',
    yaxis_title=''
)

fig.write_html("app_corr.html")

In [3]:
reader = pd.read_csv(DATACSV, chunksize=10 ** 6)

dfs = []
max_id = 0

for chunk in reader:

    chunk["fid"] = pd.to_numeric(chunk['fid'], errors='coerce', downcast='integer')
    chunk["id"] = pd.to_numeric(chunk['id'], errors='coerce', downcast='integer')
    chunk["CalLetterFlag"] = pd.to_numeric(chunk['CalLetterFlag'], errors='coerce', downcast='integer')
    chunk["FemaleFlag"] = pd.to_numeric(chunk['FemaleFlag'], errors='coerce', downcast='integer')
    chunk_percentages = chunk.groupby('fid').agg(
        total_applications=pd.NamedAgg(column='id', aggfunc='count'),
        total_interviews=pd.NamedAgg(column='CalLetterFlag', aggfunc='sum'),
        gender_flag=pd.NamedAgg(column='FemaleFlag', aggfunc='first')
    )

    dfs.append(chunk_percentages)
    max_id = max(chunk["fid"].max(), max_id)

result = np.zeros((max_id + 1, 3))

print(max_id)

for chunk in dfs:
    for index, row in chunk.iterrows():
        result[index, 0] += row["total_applications"]
        result[index, 1] += row["total_interviews"]
        result[index, 2] = 2 if row["gender_flag"] else 1

result_df = pd.DataFrame(result, columns=["total_applications", "total_interviews", "gender_flag"])

print(result_df)

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


22013707
          total_applications  total_interviews  gender_flag
0                       54.0              53.0          1.0
1                        0.0               0.0          0.0
2                        0.0               0.0          0.0
3                        0.0               0.0          0.0
4                        0.0               0.0          0.0
...                      ...               ...          ...
22013703                 0.0               0.0          0.0
22013704                 0.0               0.0          0.0
22013705                 0.0               0.0          0.0
22013706                 0.0               0.0          0.0
22013707                 1.0               0.0          1.0

[22013708 rows x 3 columns]


In [4]:
result_df = result_df[(result_df["total_applications"] > 0.0) & (result_df["gender_flag"] != 0.0)]
# female = result_df[(result_df["total_applications"] > 0.0) & (result_df["gender_flag"] == 2.0)]

result_df["interview_percent"] = result_df["total_interviews"] / result_df["total_applications"]
# female["interview_percent"] = female["total_interviews"] / female["total_applications"]

male = result_df[(result_df["gender_flag"] == 1) & (result_df["total_applications"] < 100) & (result_df["total_interviews"] < 20)]
female = result_df[(result_df["gender_flag"] == 2) & (result_df["total_applications"] < 100) & (result_df["total_interviews"] < 20)]

fig = go.Figure()
fig.add_trace(go.Histogram(x=male["total_interviews"], name='Male', opacity=0.7))
fig.add_trace(go.Histogram(x=female["total_interviews"], name='Female', opacity=0.7))
fig.update_layout(title='Histogram of the Total Interviews Recieved for Male and Female',
                  xaxis_title='Total Interviews Recieved',
                  yaxis_title='Frequency',
                  barmode='overlay',  # Overlay histograms for better comparison
                  legend=dict(title='Gender'))

fig.write_html("interview_hist.html")

fig = go.Figure()
fig.add_trace(go.Histogram(x=male["total_applications"], name='Male', opacity=0.7))
fig.add_trace(go.Histogram(x=female["total_applications"], name='Female', opacity=0.7))
fig.update_layout(title='Histogram of the Total Applications Submitted for Male and Female',
                  xaxis_title='Total Applications Submitted',
                  yaxis_title='Frequency',
                  barmode='overlay',  # Overlay histograms for better comparison
                  legend=dict(title='Gender'))

fig.write_html("application_hist.html")

fig = go.Figure()
fig.add_trace(go.Histogram(x=male["interview_percent"], name='Male', opacity=0.7))
fig.add_trace(go.Histogram(x=female["total_applications"], name='Female', opacity=0.7))
fig.update_layout(title='Histogram of the Probility of Recieving an Interview for Male and Female',
                  xaxis_title='Probility of Recieving andInterview',
                  yaxis_title='Frequency',
                  barmode='overlay',  # Overlay histograms for better comparison
                  legend=dict(title='Gender'))

fig.write_html("percent_hist.html")

# result_df['prob_at_least_one_interview'] = 1 - (1 - (result_df['total_interviews'] / result_df['total_applications'])) ** result_df['total_applications']

# fig = px.scatter(result_df, x='total_applications', y='prob_at_least_one_interview',
#                    labels={'prob_at_least_one_interview': 'Probability of At Least One Interview'},
#                    title='Probability of Getting At Least One Interview vs Total Applications')
# # Show the plot
# fig.write_html("prob_interview.html")

# print(result_df, result_df["total_interviews"].min())

In [13]:
import pandas as pd
import plotly.graph_objects as go

df = pd.read_csv("https://gist.githubusercontent.com/jbrobst/56c13bbbf9d97d187fea01ca62ea5112/raw/e388c4cae20aa53cb5090210a42ebb9b765c0a36/active_cases_2020-07-17_0800.csv")

print(df)

fig = go.Figure(data=go.Choropleth(
    geojson="https://gist.githubusercontent.com/jbrobst/56c13bbbf9d97d187fea01ca62ea5112/raw/e388c4cae20aa53cb5090210a42ebb9b765c0a36/india_states.geojson",
    featureidkey='properties.ST_NM',
    locationmode='geojson-id',
    locations=df['state'],
    z=df['active cases'],

    autocolorscale=False,
    colorscale='Reds',
    marker_line_color='peachpuff',

    colorbar=dict(
        title={'text': "Active Cases"},

        thickness=15,
        len=0.35,
        bgcolor='rgba(255,255,255,0.6)',

        tick0=0,
        dtick=20000,

        xanchor='left',
        x=0.01,
        yanchor='bottom',
        y=0.05
    )
))

fig.update_geos(
    visible=False,
    projection=dict(
        type='conic conformal',
        parallels=[12.472944444, 35.172805555556],
        rotation={'lat': 24, 'lon': 80}
    ),
    lonaxis={'range': [68, 98]},
    lataxis={'range': [6, 38]}
)

fig.update_layout(
    title=dict(
        text="Active COVID-19 Cases in India by State as of July 17, 2020",
        xanchor='center',
        x=0.5,
        yref='paper',
        yanchor='bottom',
        y=1,
        pad={'b': 10}
    ),
    margin={'r': 0, 't': 30, 'l': 0, 'b': 0},
    # height=550,
    # width=550
)

fig.write_html("india.html")

                                       state  active cases
0                          Andaman & Nicobar            47
1                             Andhra Pradesh         18159
2                          Arunachal Pradesh           387
3                                      Assam          6818
4                                      Bihar          7549
5                                 Chandigarh           164
6                               Chhattisgarh          1260
7   Dadra and Nagar Haveli and Daman and Diu           179
8                                      Delhi         17407
9                                        Goa          1272
10                                   Gujarat         11289
11                                   Haryana          5495
12                          Himachal Pradesh           382
13                           Jammu & Kashmir          5488
14                                 Jharkhand          2069
15                                 Karnataka         306

In [14]:
#4.1 - 95.9 / 97.3 - 2.7
#2.1        /  3

# F 5.3 -> 2.1
# M 8.1 -> 10.3

df = px.data.tips()
print(df)

df = pd.DataFrame([["Female", .42], ["Male", .48]])
print(df)

fig = px.pie(df, values=1, names=0, title='Interviews Recieved before March of 2017')
fig.write_html("pie_before.html")

df = pd.DataFrame([["Female", .169], ["Male", .83]])

fig = px.pie(df, values=1, names=0, title='Interviews Recieved after March of 2017')
fig.write_html("pie_after.html")

     total_bill   tip     sex smoker   day    time  size
0         16.99  1.01  Female     No   Sun  Dinner     2
1         10.34  1.66    Male     No   Sun  Dinner     3
2         21.01  3.50    Male     No   Sun  Dinner     3
3         23.68  3.31    Male     No   Sun  Dinner     2
4         24.59  3.61  Female     No   Sun  Dinner     4
..          ...   ...     ...    ...   ...     ...   ...
239       29.03  5.92    Male     No   Sat  Dinner     3
240       27.18  2.00  Female    Yes   Sat  Dinner     2
241       22.67  2.00    Male    Yes   Sat  Dinner     2
242       17.82  1.75    Male     No   Sat  Dinner     2
243       18.78  3.00  Female     No  Thur  Dinner     2

[244 rows x 7 columns]
        0     1
0  Female  0.39
1    Male  0.61
