In [22]:
import pandas as pd
import numpy as np
pro_df = pd.read_csv('linkedin_projects.csv')
emp_pro_df = pd.read_csv('linkedin_emp_projects.csv')
emp_df = pd.read_csv('linkedin_employees.csv')

Identify projects that are at risk for going overbudget. A project is considered to be overbudget if the cost of all employees assigned to the project is greater than the budget of the project.


You'll need to prorate the cost of the employees to the duration of the project. For example, if the budget for a project that takes half a year to complete is 10K dollars, then the total half-year salary of all employees assigned to the project should not exceed $10K. Salary is defined on a yearly basis, so be careful how to calculate salaries for the projects that last less or more than one year.


Output a list of projects that are overbudget with their project name, project budget, and prorated total employee expense (rounded to the next dollar amount).



In [75]:
# adding a salary_per_day of each employee into employee table
emp_df['salary_per_day'] = emp_df['salary']/365
emp_df

Unnamed: 0,id,first_name,last_name,salary,salary_per_day
0,10592,Jennifer,Roberts,20204,55.353425
1,10593,Haley,Ho,33154,90.832877
2,10594,Eric,Mccarthy,32360,88.657534
3,10595,Gina,Martinez,46388,127.090411
4,10596,Jason,Fields,12348,33.830137
...,...,...,...,...,...
95,10687,Angela,Smith,20019,54.846575
96,10688,Patrick,Gonzalez,11482,31.457534
97,10689,Gabrielle,Velez,36584,100.230137
98,10690,Catherine,Luna,39237,107.498630


### Finding total days to finish each projects

In [78]:
# how to change multiples columns from object into datetime type 
emp_pro_df[['end_date','start_date']] = emp_pro_df[['end_date','start_date']].apply(pd.to_datetime)

In [108]:
# calculate the total days of each project and change into days type
emp_pro_df['total_days'] = (emp_pro_df['end_date'] - emp_pro_df['start_date']).dt.days
emp_pro_df

Unnamed: 0,id,title,budget,start_date,end_date,total_days
0,1,Project1,29498,2018-08-31,2019-03-13,194
1,2,Project2,32487,2018-01-27,2018-12-13,320
2,3,Project3,43909,2019-11-05,2019-12-09,34
3,4,Project4,15776,2018-06-28,2018-11-20,145
4,5,Project5,36268,2019-03-13,2020-01-02,295
5,6,Project6,41611,2018-09-18,2019-08-28,344
6,7,Project7,34003,2020-05-28,2020-10-01,126
7,8,Project8,49284,2019-12-18,2020-04-18,122
8,9,Project9,32341,2018-05-24,2019-05-11,352
9,10,Project10,47587,2018-06-24,2018-11-19,148


### Finding specific employees to finish each projects

In [113]:
# merge project table with employee-project table
pd.merge(pro_df,emp_pro_df,left_on='project_id',right_on='id').drop(columns=['start_date','end_date','id'])
q = pd.merge(pro_df,emp_pro_df,left_on='project_id',right_on='id').drop(columns=['start_date','end_date','id'])
q

Unnamed: 0,emp_id,project_id,title,budget,total_days
0,10592,1,Project1,29498,194
1,10642,1,Project1,29498,194
2,10593,2,Project2,32487,320
3,10643,2,Project2,32487,320
4,10594,3,Project3,43909,34
...,...,...,...,...,...
95,10689,48,Project48,41628,179
96,10640,49,Project49,48774,208
97,10690,49,Project49,48774,208
98,10641,50,Project50,18915,173


### Finding specific salary of those employees

In [114]:
# merge with employee table to find which employees are handling each projects
pd.merge(q,emp_df,left_on='emp_id',right_on='id').drop(columns=['id'])
q = pd.merge(q,emp_df,left_on='emp_id',right_on='id').drop(columns=['id'])
q

Unnamed: 0,emp_id,project_id,title,budget,total_days,first_name,last_name,salary,salary_per_day
0,10592,1,Project1,29498,194,Jennifer,Roberts,20204,55.353425
1,10642,1,Project1,29498,194,Joshua,Salinas,48079,131.723288
2,10593,2,Project2,32487,320,Haley,Ho,33154,90.832877
3,10643,2,Project2,32487,320,Sarah,Briggs,27150,74.383562
4,10594,3,Project3,43909,34,Eric,Mccarthy,32360,88.657534
...,...,...,...,...,...,...,...,...,...
95,10689,48,Project48,41628,179,Gabrielle,Velez,36584,100.230137
96,10640,49,Project49,48774,208,Alan,Miller,12724,34.860274
97,10690,49,Project49,48774,208,Catherine,Luna,39237,107.498630
98,10641,50,Project50,18915,173,Brandy,Ellison,30396,83.276712


In [120]:
# (rounded to the next dollar amount)
q['total_salary'] = (q['total_days']*q['salary_per_day']).round(0).astype(int)
q

Unnamed: 0,emp_id,project_id,title,budget,total_days,first_name,last_name,salary,salary_per_day,total_salary
0,10592,1,Project1,29498,194,Jennifer,Roberts,20204,55.353425,10739
1,10642,1,Project1,29498,194,Joshua,Salinas,48079,131.723288,25554
2,10593,2,Project2,32487,320,Haley,Ho,33154,90.832877,29067
3,10643,2,Project2,32487,320,Sarah,Briggs,27150,74.383562,23803
4,10594,3,Project3,43909,34,Eric,Mccarthy,32360,88.657534,3014
...,...,...,...,...,...,...,...,...,...,...
95,10689,48,Project48,41628,179,Gabrielle,Velez,36584,100.230137,17941
96,10640,49,Project49,48774,208,Alan,Miller,12724,34.860274,7251
97,10690,49,Project49,48774,208,Catherine,Luna,39237,107.498630,22360
98,10641,50,Project50,18915,173,Brandy,Ellison,30396,83.276712,14407


In [122]:
# groupby 'project_id', 'title', 'budget'
q.groupby(['project_id','title','budget']).sum()['total_salary'].reset_index()
q = q.groupby(['project_id','title','budget']).sum()['total_salary'].reset_index()
q

Unnamed: 0,project_id,title,budget,total_salary
0,1,Project1,29498,36293
1,2,Project2,32487,52870
2,3,Project3,43909,7299
3,4,Project4,15776,30655
4,5,Project5,36268,24043
5,6,Project6,41611,63229
6,7,Project7,34003,22305
7,8,Project8,49284,16231
8,9,Project9,32341,44690
9,10,Project10,47587,26206


In [123]:
# add a 'over_budget' column and filter the table to find out the projects are overbudget
q['over_budget'] = q['total_salary'] - q['budget']
q[q['over_budget'] >=0].sort_values(by='over_budget',ascending=False)

Unnamed: 0,project_id,title,budget,total_salary,over_budget
23,24,Project24,11918,74665,62747
31,32,Project32,12356,66523,54167
36,37,Project37,8806,61949,53143
11,12,Project12,10468,62843,52375
25,26,Project26,36190,79368,43178
28,29,Project29,10935,48371,37436
19,20,Project20,19497,55961,36464
17,18,Project18,10302,46381,36079
20,21,Project21,24330,57309,32979
45,46,Project46,9824,42314,32490
