## Predict the company size a student is expected to join after graduation

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# use random forest to predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error



In [7]:
# Plot theme: seaborn's "darkgrid" style with a light-grey axes background
# and slightly darker grid lines.
sns.set_style('darkgrid', {'axes.facecolor': '.9', 'grid.color': '.8'})

# Make "deep" the default colour cycle, and keep the palette's colours in
# `sns_c` so individual colours can be picked by index.
sns.set_palette('deep')
sns_c = sns.color_palette('deep')

## Preprocessing Data

In [20]:
# Load the preprocessed license records.
students_licenses = pd.read_csv('../data/preprocessed-data/csv/licenses.csv')

# Drop rows that have no Title. The frame is rebound rather than mutated
# with inplace=True, so the cleaning step is an ordinary expression.
students_licenses = students_licenses.dropna(subset=['Title'])

# Preview goes last: a notebook cell only renders its final expression, so
# a `.head()` placed before the cleaning step was silently discarded.
students_licenses.head()


In [36]:
# Load the preprocessed education records.
students_education = pd.read_csv(
    filepath_or_buffer='../data/preprocessed-data/csv/educations.csv'
)

# Show the first five rows as a quick sanity check of the columns.
students_education.head(n=5)

Unnamed: 0,University,Degree,Date,User
0,Information Technology Institute (ITI),Nov 2021 - Jun 2022,,%D9%90%D9%90amiraelmergawy
1,Ain Shams University,"Bachelor's degree, Computer Science",2016 - 2020,%D9%90%D9%90amiraelmergawy
2,Information Technology Institute (ITI),"Internship, Web Development using MEARN Stack",Dec 2021,-ezz
3,Al-Azhar University,Geophysics and Seismology,Sep 2015 - Aug 2019,-ezz
4,"Faculty of Petroleum and Mining Engineering, S...","Bachelor's degree, Petroleum Engineering",2015 - 2020,1212mohamedtaha


In [37]:
# Keep only rows with a known university and lowercase the names so that
# differently-cased spellings of the same university are counted together.
# dropna/assign return new frames, so nothing is mutated in place.
students_education = (
    students_education
    .dropna(subset=['University'])
    .assign(University=lambda frame: frame['University'].str.lower())
)

# Exclude online course platforms and ITI entries.
# The pattern is a raw string: `\(` and `\)` must reach the regex engine as
# escaped literal parentheses, and in a plain string literal they are
# invalid escape sequences that Python warns about.
non_university_pattern = r'udacity|udemy|coursera|information technology institute|\(iti\)'
is_non_university = students_education['University'].str.contains(non_university_pattern)
students_education = students_education[~is_non_university]

# The ten most frequent universities among the remaining rows.
top_10_universities = students_education['University'].value_counts().head(10).index

print(top_10_universities)

# Keep only the education rows that belong to one of those ten universities.
students_education = students_education[students_education['University'].isin(top_10_universities)]

students_education.head()

Index(['cairo university', 'ain shams university', 'alexandria university',
       'mansoura university', 'helwan university cairo',
       'the german university in cairo',
       'arab academy for science, technology and maritime transport',
       'zagazig university', 'tanta university',
       'the british university in egypt'],
      dtype='object')


Unnamed: 0,University,Degree,Date,User
1,ain shams university,"Bachelor's degree, Computer Science",2016 - 2020,%D9%90%D9%90amiraelmergawy
8,ain shams university,"Bachelor of Engineering (B.Eng.), Computers an...",2013 - 2018,a-mohsen
11,cairo university,"Bachelor's degree, Communications and Computer...",2014 - 2019,a-youssry
13,cairo university,"Bachelor of Engineering - BE, Computer Enginee...",2017 - 2022,aaarafat
15,cairo university,"Good, Electronic and Communication",2002 - 2007,aabdelsattaar


In [39]:
# Load the company table and discard companies whose size is unknown.
companies = pd.read_csv('../data/companies/companies.csv').dropna(subset=['companySize'])

# Keep three columns: the size bucket, the company name, and `query`
# (LinkedIn company URLs in the preview rows below).
companies = companies[['companySize', 'name', 'query']]

companies.head()

Unnamed: 0,companySize,name,query
0,51-200 employees,National Telecommunication Institute (NTI),https://www.linkedin.com/company/15805726/
2,11-50 employees,(ISCC) International Staffing & Consulting Co.,https://www.linkedin.com/company/2205041/
3,11-50 employees,sonono,https://www.linkedin.com/company/30101254/
4,"1,001-5,000 employees",Etisalat Egypt,https://www.linkedin.com/company/777868/
5,11-50 employees,"The Translation Gate, LLC",https://www.linkedin.com/company/2229028/


In [52]:
# Load the experience records and collapse them to one row per user.
# After the aggregation the index is `User`, and every other column holds a
# Python list of that user's values for that column.
# All columns are kept; no column subset is selected at this stage.
students_experiences = (
    pd.read_csv('../data/preprocessed-data/csv/experiences.csv')
    .groupby('User')
    .agg(lambda values: values.tolist())
)

students_experiences.head()

Unnamed: 0_level_0,CompanyUrl,Company,Title,EmploymentType,From,To,Duration,Location
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
%D9%90%D9%90amiraelmergawy,"[https://www.linkedin.com/company/294614/, htt...","[Information Technology Institute (ITI), Natio...","[Teaching Assistant, MEAN Stack Developer, Sof...","[Full-time, Internship, Internship, Internship]","[Aug 2022, Nov 2020, Jul 2019, Aug 2018]","[Present, Jan 2021, nan, nan]","[9.0, 3.0, 1.0, 1.0]","[Egypt, nan, Cairo, Egypt, Cairo, Egypt]"
-ezz,"[https://www.linkedin.com/company/30101254/, h...","[sonono, Information Technology Institute (ITI...","[Software Engineer, Internship Trainee, Senior...","[Full-time, Full-time, Full-time]","[Jan 2022, Oct 2021, Sep 2019]","[Present, Jan 2022, Aug 2020]","[16.0, 4.0, 12.0]","[Basel, Switzerland, Egypt, Egypt]"
1212mohamedtaha,"[https://www.linkedin.com/company/777868/, htt...","[Etisalat Misr, Information Technology Institu...","[Big Data Engineer, Student Trainee, English /...","[Full-time, Full-time, Freelance]","[May 2022, Apr 2021, Jan 2021]","[Present, Feb 2022, Mar 2021]","[12.0, 11.0, 3.0]","[New Cairo, Cairo, Egypt, El Mansoura, Ad Daqa..."
3omarbadr,"[https://www.linkedin.com/company/18899346/, h...",[ACME SAICO - Integrated Engineering Systems -...,"[Software Engineer, Software Engineer, Softwar...","[Full-time, Part-time, Full-time, Internship, ...","[Sep 2022, Sep 2022, Jan 2022, Oct 2021, Jan 2...","[Present, Present, Aug 2022, Dec 2021, nan]","[8.0, 8.0, 8.0, 3.0, 1.0]","[Heliopolis, Cairo, Egypt, Amman, Jordan, Alex..."
a-mohsen,"[https://www.linkedin.com/company/10590015/, h...","[ItsaCheckmate, ItsaCheckmate, ItsaCheckmate, ...","[Software Tech Lead, Senior Software Engineer,...","[nan, nan, nan, Freelance, Full-time, Part-tim...","[Aug 2022, May 2021, May 2020, Aug 2018, May 2...","[Present, Aug 2022, May 2021, Present, Aug 202...","[9.0, 16.0, 13.0, 57.0, 16.0, 3.0, 6.0, 3.0]","[nan, nan, nan, nan, Egypt, Cairo Governorate,..."
