In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
import plotly.express as px 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Load the layoffs dataset; the path is relative to the notebook's working directory.
layoff_df = pd.read_csv("../data/layoffs.csv")
layoff_df

In [None]:
# info must be called; the bare attribute only shows the bound-method repr
# instead of the column / dtype / non-null summary.
layoff_df.info()

In [None]:
print(layoff_df.dtypes)

In [None]:
layoff_df.isnull().sum()

# Exploring Industry Data

In [None]:
# Count rows per industry and move the industry labels out of the index
# so the result is a plain two-column frame.
industry = pd.DataFrame(layoff_df["industry"].value_counts().reset_index())
industry

In [None]:
# Give the two columns presentation-friendly names (label column, count column).
industry.columns = ['Industry', 'Companies']
industry

In [None]:
industry.Companies.nlargest(5)

In [None]:
# Keep only the rows for five hard-coded industries.
# NOTE(review): presumably these are the five largest from the nlargest(5) cell above — confirm against the data.
topind_layoffs = industry[industry['Industry'].isin(['Finance', 'Retail', 'Healthcare','Transportation', 'Marketing'])]

In [None]:
# Bar chart of the selected industries, drawn through the current Axes object.
bar_ax = plt.gca()
bar_ax.bar(topind_layoffs['Industry'], topind_layoffs['Companies'])
bar_ax.set_title('Industry with the most Layoffs')
bar_ax.set_xlabel('Industry')
bar_ax.set_ylabel('Number of Companies')
plt.show()

# Exploring Location Data

In [None]:
# Count rows per location and move the location labels out of the index
# so the result is a plain two-column frame.
location = pd.DataFrame(layoff_df["location"].value_counts().reset_index())
location

In [None]:
# Give the two columns presentation-friendly names (label column, count column).
location.columns = ['Location', 'Companies']
location

In [None]:
# Keep only the rows for five hard-coded locations.
# NOTE(review): presumably the five most frequent locations — confirm against the counts above.
top_location = location[location['Location'].isin(['SF Bay Area', 'New York City', 'Los Angeles','Boston', 'Seattle'])]

In [None]:
# Step plot of the row count for each selected location.
plt.step(top_location.Location,top_location.Companies, color='green')
plt.title('Location with the most Layoffs')
plt.xlabel('Location')
plt.ylabel('Number of Companies')
plt.show()

# Exploring Country Data

In [None]:
# Count rows per country and move the country labels out of the index
# so the result is a plain two-column frame.
country = pd.DataFrame(layoff_df["country"].value_counts().reset_index())
country

In [None]:
# Give the two columns presentation-friendly names (label column, count column).
country.columns = ['Country', 'Companies']
country

In [None]:
# Keep only the rows for five hard-coded countries.
# NOTE(review): presumably the five most frequent countries — confirm against the counts above.
top_countries = country[country['Country'].isin(['United States', 'India', 'Canada','Brazil', 'United Kingdom'])]

In [None]:
# Filled-area (stack) plot of the row count for each selected country.
# NOTE(review): stackplot's own colour parameter is `colors`; `color` is forwarded
# to the underlying fill call — confirm the chart renders in purple as intended.
plt.stackplot(top_countries.Country,top_countries.Companies, color='purple')
plt.title('Country with the most Layoffs')
plt.xlabel('Country')
plt.ylabel('Number of Companies')
plt.show()

In [None]:
# Creating dataset
# Labels and values are read from the same top_countries frame so they always
# have matching lengths. The former hard-coded lists had five labels but six
# values, which makes ax.pie raise a ValueError.
nations = top_countries['Country'].tolist()
nation_data = top_countries['Companies'].tolist()


# Creating explode data: repeat the offset pattern so there is exactly one
# entry per wedge (ax.pie requires len(explode) == len(x))
explode_pattern = (0.1, 0.0, 0.2, 0.3, 0.0, 0.0)
explode = [explode_pattern[i % len(explode_pattern)] for i in range(len(nations))]

# Creating color parameters (the pie cycles through these; extras are unused)
colors = ( "orange", "cyan", "brown",
          "grey", "indigo", "beige")

# Wedge properties
wp = { 'linewidth' : 1, 'edgecolor' : "green" }

# Creating autopct arguments
def func(pct, allvalues):
    """Build a wedge label: the percentage plus the absolute count it represents.

    Parameters
    ----------
    pct : float
        Percentage of the whole pie covered by the wedge (supplied by autopct).
    allvalues : sequence of numbers
        All wedge values; their sum is the pie total.

    Returns
    -------
    str
        Two-line label such as "50.0%\\n(10)".
    """
    # Round before converting: pct is a float, so the product can land just
    # below the true integer and plain int() truncation would be off by one.
    absolute = int(round(pct / 100.*np.sum(allvalues)))
    # The count has no unit suffix; the old "g" was a leftover gram label.
    return "{:.1f}%\n({:d})".format(pct, absolute)

# Creating plot
fig, ax = plt.subplots(figsize =(10, 7))
wedges, texts, autotexts = ax.pie(nation_data,
                                autopct = lambda pct: func(pct, nation_data),
                                explode = explode,
                                labels = nations,
                                shadow = True,
                                colors = colors,
                                startangle = 90,
                                wedgeprops = wp,
                                textprops = dict(color ="magenta"))

# Adding legend
ax.legend(wedges, nations,
        title ="Countries",
        loc ="center left",
        bbox_to_anchor =(1, 0, 0.5, 1))

plt.setp(autotexts, size = 8, weight ="bold")
ax.set_title("Countries with the most Layoffs")

# show plot
plt.show()


In [None]:
# Donut chart (hole=.5) of total_laid_off split by country.
fig = px.pie(layoff_df, values='total_laid_off', names='country',hole=.5,
                    template="plotly_white")
# Draw the percentage and the country label inside each wedge.
fig.update_traces(textposition='inside',textinfo='percent+label')
fig.update_layout(height=700,
                  title='Percentage of Laid-off Employees By Country')
fig.show()

# Exploring Company Data

In [None]:
# The five rows with the largest total_laid_off (rows, not companies: a company
# that appears in several rows can show up more than once).
top_companies = layoff_df.nlargest(5,['total_laid_off'])
top_companies

In [None]:
# Bar chart of the five largest layoff rows, one bar per company name.
plt.figure(figsize=(10, 6))
sns.barplot(data=top_companies, x="company", y="total_laid_off")
plt.title("Company with the most Layoffs", fontsize=15, fontweight='bold')
plt.xlabel("Company")
plt.ylabel("Number of Layoffs")
plt.show()

In [None]:
# Sort rows by layoff size, keep the first (largest) row per country, then take
# the first ten of those rows.
nested_pie =  layoff_df.sort_values('total_laid_off', ascending=False).drop_duplicates('country').iloc[:10]

# Sunburst with country as the inner ring and company as the outer ring;
# wedge size and colour both follow total_laid_off.
fig = px.sunburst(nested_pie, path=['country','company'], values='total_laid_off',
                  color='total_laid_off',
                  color_continuous_scale='rdbu',template="plotly_white",
                  )
fig.update_layout(height=650, title = "Major Corporations from Ten Nations that have reduced their Workforce" , title_x = 0.47,)
fig.show()

# Exploring Stage Data

In [None]:
layoff_df.stage.unique()

In [None]:
layoff_df.groupby(["stage"]).total_laid_off.sum()

In [None]:
# Horizontal bar chart of total layoffs summed per funding stage, largest first.
px.bar(layoff_df.groupby(["stage"]).total_laid_off.sum().sort_values(ascending=False),title='Layoffs By Company Stage',
       text_auto=True,orientation='h')

# General Statistics

Let's do a correlation map of all the data

In [None]:
# Correlate the numeric columns only: the frame also holds text columns, and
# DataFrame.corr raises on those in pandas 2.x.
layoff_df.select_dtypes(include="number").corr()

In [None]:
# Heatmap of the pairwise correlations between numeric columns. Text columns
# are excluded first because DataFrame.corr raises on them in pandas 2.x.
sns.heatmap(layoff_df.select_dtypes(include="number").corr(), annot=True, cmap="coolwarm")

Let's get some more information about the number of people laid off.

In [None]:
layoff_df.total_laid_off.mean()

In [None]:
layoff_df.total_laid_off.min()

In [None]:
layoff_df.total_laid_off.max()

In [None]:
layoff_df.total_laid_off.sum()

Now let's do the same for the percent laid off

In [None]:
layoff_df.percentage_laid_off.mean()

In [None]:
layoff_df.percentage_laid_off.min()

In [None]:
layoff_df.percentage_laid_off.max()

In [None]:
layoff_df.percentage_laid_off.sum()

Now let's do it for the amount of funds raised

In [None]:
layoff_df.funds_raised.mean()

In [None]:
layoff_df.funds_raised.min()

In [None]:
layoff_df.funds_raised.max()

In [None]:
layoff_df.funds_raised.sum()

Let's make separate columns for the day, month, and year.

In [None]:
# Work on a copy so the originally loaded frame keeps its columns.
layoff2 = layoff_df.copy()

# Split each date string on '-' once, then store the three pieces as integer
# columns in the order year, month, day.
date_parts = layoff_df['date'].str.split('-')
for position, part_name in enumerate(['year', 'month', 'day']):
    layoff2[part_name] = date_parts.str[position].astype(int)

# Alternative formula year is df["Year"] = df["date"].map(lambda x : x[:4])

In [None]:
layoff2.head(3)

Let's export this DataFrame to an Excel workbook.

In [None]:
# The context manager saves and closes the workbook even if to_excel raises,
# so the file handle is never left open.
with pd.ExcelWriter('tech_layoffs.xlsx') as xlwriter:
    layoff2.to_excel(xlwriter, sheet_name='layoffs')

# Yearly Data Analysis

In [None]:
layoff2.groupby(["year"],sort=False)["total_laid_off"].max()

In [None]:
# Categorical scatter of layoff sizes per year, resized to 10 x 6 inches.
g = sns.catplot(x='year', y='total_laid_off', data=layoff2)
g.fig.set_size_inches(10, 6)

In [None]:
# Largest single layoff per (year, company). The aggregation is named by the
# string 'max' rather than the builtin: pandas deprecates builtin callables in
# agg, and the resulting column is still called "max".
top_3_year_wise = layoff2.groupby(['year','company']).total_laid_off.agg(['max'])
top_3_year_wise

In [None]:
# Regroup the per-company maxima by year; group_keys=False keeps the original
# (year, company) index instead of prefixing the group key again.
g = top_3_year_wise["max"].groupby(['year'],group_keys=False)
# Within each year keep the three largest values.
top_3_year_wise2 = g.apply(lambda x : x.sort_values(ascending=False).head(3))
top_3_year_wise2

In [None]:
g.nlargest(3)

In [None]:
top_3_year_wise2.index
top_3_year_wise2.values

In [None]:
# Start the plotting frame with a single column holding the top-3 values.
top_3_year_wise3 = pd.DataFrame({"total_laid_off": top_3_year_wise2.values})
top_3_year_wise3

In [None]:
top_3_year_wise3.columns = ['Number of Layoffs']

In [None]:
# Split the (year, company) index tuples into two parallel label lists and
# attach them as columns.
y = [year for year, _ in top_3_year_wise2.index]
c = [company for _, company in top_3_year_wise2.index]
top_3_year_wise3["Year"] = y
top_3_year_wise3["Company"] = c
top_3_year_wise3

In [None]:
# Stacked bars per year, one coloured segment per company, with values printed on the bars.
px.bar(top_3_year_wise3,x='Year',y='Number of Layoffs',color='Company', title='Top 3 Companies with the most Layoffs Year-wise',text_auto=True)

In [None]:
# Largest single layoff per (year, location). The aggregation is named by the
# string 'max' rather than the builtin: pandas deprecates builtin callables in
# agg, and the resulting column is still called "max".
top_3_loction_year_wise = layoff2.groupby(["year","location"]).total_laid_off.agg(['max'])
top_3_loction_year_wise

In [None]:
# Regroup the per-location maxima by year; group_keys=False keeps the original
# (year, location) index instead of prefixing the group key again.
gloc = top_3_loction_year_wise.groupby(["year"],group_keys=False)
# Within each year keep the three rows with the largest "max" value.
top_3_loction_year_wise2 = gloc.apply(lambda x : x.sort_values(["max"], ascending=False).head(3)) 
top_3_loction_year_wise2

In [None]:
# Read the values from the "max" column directly. reshape(9,) only works when
# the frame has exactly nine rows (three years x three locations) and raises
# for any other number of years.
top_3_loction_year_wise2["max"].tolist()

In [None]:
# Start the plotting frame with the top-3 values. They are read from the "max"
# column instead of reshape(9,), which raises unless there are exactly nine
# rows (three years x three locations).
top_3_loction_year_wise3 = pd.DataFrame()
top_3_loction_year_wise3["total_laid_off"] = top_3_loction_year_wise2["max"].tolist()
top_3_loction_year_wise3

In [None]:
top_3_loction_year_wise3.columns = ['Number of Layoffs']

In [None]:
# Split the (year, location) index tuples into two parallel label lists and
# attach them as columns.
y = [year for year, _ in top_3_loction_year_wise2.index]
l = [place for _, place in top_3_loction_year_wise2.index]
top_3_loction_year_wise3["Year"] = y
top_3_loction_year_wise3["Location"] = l
top_3_loction_year_wise3

In [None]:
# Stacked bars per year, one coloured segment per location, with values printed on the bars.
px.bar(top_3_loction_year_wise3,x='Year',y='Number of Layoffs',color='Location', title='Top 3 Locations Year-wise with the most Layoffs',text_auto=True)

In [None]:
# Total layoffs summed per (year, country); the result is a Series with a two-level index.
total_laid_year_country_wise  = layoff2.groupby(["year","country"]).total_laid_off.sum()
total_laid_year_country_wise 

In [None]:
len(layoff2.country.unique())

In [None]:
# Unpack the (year, country) index tuples into two parallel lists.
total_laid_year_country_wise_year = [year for year, _ in total_laid_year_country_wise.index]
total_laid_year_country_wise_country = [nation for _, nation in total_laid_year_country_wise.index]

In [None]:
# Flatten the grouped Series into a plain frame: one row per (year, country) with its summed layoffs.
total_laid_year_country_wise2 = pd.DataFrame({
    "Year": total_laid_year_country_wise_year,
    "Country": total_laid_year_country_wise_country,
    "total_laid_off": total_laid_year_country_wise.values 
})

In [None]:
total_laid_year_country_wise2

In [None]:
total_laid_year_country_wise2.columns = ['Year', 'Country', 'Number of Layoffs']

In [None]:
# Sort in place: latest year first, and within a year the largest totals first.
total_laid_year_country_wise2.sort_values(["Year","Number of Layoffs"],ascending=False,inplace=True)

In [None]:
# Stacked bars per year with one segment per country; each segment is labelled with the country name.
px.bar(total_laid_year_country_wise2,x='Year',y='Number of Layoffs',color='Country',text='Country',
      title='Layoffs By Country Year-wise'
      )

# United States Data

In [None]:
# Rows whose country is exactly "United States".
usa_df = layoff2[layoff2['country']=="United States"]

In [None]:
# Number of US rows per industry; labels are rotated so they stay readable.
plt.figure(figsize=(7, 7))
sns.countplot(data=usa_df, x='industry')
plt.title('Most Affected Industries in the USA')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Distribution of US rows across locations, resized to 10 x 6 inches.
# NOTE(review): kde=True is requested on a text-valued column — confirm the density overlay is meaningful here.
loc = sns.displot(x=usa_df['location'], data=usa_df, kde=True)
plt.title('Most Affected Locations in the USA')
loc.fig.set_figwidth(10)
loc.fig.set_figheight(6)

In [None]:
# Sum only the numeric columns per company, then rank by total layoffs.
# Without numeric_only=True, pandas 2.x also "sums" the text columns
# (concatenating strings) or raises.
usa_companies = usa_df.groupby(['company']).sum(numeric_only=True).sort_values(['total_laid_off'],ascending=False)
usa_companies

In [None]:
# Plot the ten largest companies by total_laid_off. The column is selected by
# name; the old positional `:1` slice plotted whichever column came first.
px.bar(usa_companies[['total_laid_off']].head(10), text_auto=True,title='Top 10 companies in the USA with the most Layoffs ')

# Prediction, Classification, & Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score

In [None]:
# Replace every missing value, in every column, with 0.
df = layoff2.fillna(0)
df

In [None]:
# Columns pandas treats as numeric (note: _get_numeric_data is a private pandas method).
numerical_cols = df._get_numeric_data().columns
# Everything else is treated as categorical; going through a set means the order is arbitrary.
categorical_cols = list(set(df.columns)-set(numerical_cols))
categorical_cols

In [None]:
# Drop both columns in one call. Previously the second drop started from `df`
# again and overwrote the first result, so 'date' was never removed.
df2 = df.drop(['date', 'company'], axis=1)
df2

Let's turn all the text values in the dataset's columns into numerical values.

Stage

In [None]:
# df2["stage"].unique()

In [None]:
# Encode each funding stage as an arbitrary integer code. Values that are not
# keys of this mapping are left unchanged by replace.
df2["stage"] = df2["stage"].replace({'Series D':1, 'Unknown':2, 'IPO':3, 'Series A':4, 'Series B':5, 'Series C':6,
       'Private Equity':7, 'Acquired':8, 'Series H':9, 'Series F':10, 'Series E':11,
       'Series G':12, 'Seed':13, 'Series J':14, 'Series I':15})
df2

Country

In [None]:
# df2['country'].unique()

In [None]:
# Encode each country as an arbitrary integer code (1-54). Values that are not
# keys of this mapping are left unchanged by replace.
df2["country"] = df2["country"].replace({'United States':1, 'Indonesia':2, 'India':3, 'Sweden':4, 'Greece':5,
       'Brazil':6, 'Poland':7, 'Germany':8, 'Israel':9, 'Seychelles':10, 'Norway':11,
       'Canada':12, 'United Kingdom':13, 'Belgium':14, 'Argentina':15, 'Netherlands':16,
       'Denmark':17, 'Hong Kong':18, 'Singapore':19, 'New Zealand':20, 'Australia':21,
       'Malaysia':22, 'Hungary':23, 'Vietnam':24, 'Egypt':25, 'Austria':26, 'Thailand':27,
       'Romania':28, 'Lithuania':29, 'Nigeria':30, 'Kenya':31, 'Chile':32, 'Luxembourg':33,
       'China':34, 'Senegal':35, 'Pakistan':36, 'United Arab Emirates':37, 'Colombia':38,
       'Finland':39, 'Peru':40, 'Ireland':41, 'Bahrain':42, 'Mexico':43, 'Turkey':44,
       'Russia':45, 'Uruguay':46, 'Bulgaria':47, 'France':48, 'Switzerland':49,
       'Estonia':50, 'Portugal':51, 'South Africa':52, 'Czech Republic':53, 'Myanmar':54})
# Check which codes (or unmapped leftovers) remain.
df2["country"].unique()

Industry

In [None]:
# df2["industry"].unique()

In [None]:
# Encode each industry as an arbitrary integer code (1-27). Values that are not
# keys of this mapping are left unchanged by replace.
# 'Sales' is 14: it was previously 4, which collided with 'Crypto' and merged
# the two industries into one code.
df2["industry"] = df2["industry"].replace({'Real Estate':1, 'Transportation':2, 'Consumer':3,'Crypto':4, 'Other':5,
       'Marketing':6, 'Education':7, 'Logistics':8, 'HR':9, 'Finance':10,
       'Healthcare':11, 'Media':12, 'Aerospace':13, 'Sales':14, 'Support':15, 'Security':16,
       'Data':17, 'Retail':18, 'Food':19, 'Travel':20, 'Recruiting':21,'Legal':22,
       'Construction':23, 'Fitness':24, 'Product':25, 'Energy':26, 'Infrastructure':27})

# Check which codes (or unmapped leftovers) remain.
df2["industry"].unique()

Location

In [None]:
# df2["location"].unique()

In [None]:
# Encode each location as an arbitrary integer code. Values that are not keys
# of this mapping are left unchanged by replace.
# Notes on the codes: they start at 0 ('SF Bay Area'), 133 is unused, and both
# spellings 'Düsseldorf' (30) and 'Dusseldorf' (140) get their own code.
df2["location"] = df2["location"].replace({'SF Bay Area':0, 'Jakarta':1, 'Boston':2, 'Lehi':3, 'Eindhoven':4, 'Austin':5,
       'Seattle':6, 'Los Angeles':7, 'Columbus':8, 'Chicago':9, 'Bengaluru':10,
       'Stockholm':11, 'Athens':12, 'Sao Paulo':13, 'Minneapolis':14, 'New York City':15,
       'Krakow':16, 'Hyderabad':17, 'Berlin':18, 'Nebraska City':19, 'Copenhagen':20,
       'Vancouver':21, 'Tel Aviv':22, 'Stamford':23, 'Non-U.S.':24, 'Trondheim':25,
       'Grand Rapids':26, 'Oslo':27, 'Toronto':28, 'Pittsburgh':29, 'Düsseldorf':30,
       'Montreal':31, 'Mumbai':32, 'San Luis Obispo':33, 'Jerusalem':34, 'New Delhi':35,
       'Belo Horizonte':36, 'Baltimore':37, 'London':38, 'Philadelphia':39,
       'St. Louis':40, 'Brussels':41, 'Buenos Aires':42, 'Noida':43, 'The Hague':44,
       'Hong Kong':45, 'Singapore':46, 'Birmingham':47, 'Auckland':48,
       'Salt Lake City':49, 'Sydney':50, 'Kuala Lumpur':51, 'Munich':52,
       'Mexico City':53, 'Las Vegas':54, 'Budapest':55, 'Atlanta':56,
       'Ho Chi Minh City':57, 'Winnipeg':58, 'Cairo':59, 'Denver':60, 'Vienna':61,
       'Bangkok':62, 'Vilnius':63, 'Raleigh':64, 'Portland':65, 'Lagos':66, 'Ottawa':67,
       'Richmond':68, 'Bristol':69,'Washington D.C.':70, 'Melbourne':71, 'Saskatoon':72,
       'Brisbane':73, 'Porto Alegre':74, 'Indianapolis':75, 'Nairobi':76, 'Santiago':77,
       'Curitiba':78, 'Dallas':79, 'Miami':80, 'Gurugram':81, 'Luxembourg':82, 'Boulder':83,
       'New Haven':84, 'Victoria':85, 'Beijing':86, 'Waterloo':87, 'Detroit':88,
       'Amsterdam':89, 'Ferdericton':90, 'Dakar':91, 'Florianópolis':92, 'Lahore':93,
       'Santa Barbara':94, 'Shenzen':95, 'Durham':96, 'Louisville':97, 'Hamburg':98,
       'Nashua':99, 'Huntsville':100, 'Dubai':101, 'San Diego':102, 'Spokane':103, 'Bogota':104,
       'Chennai':105, 'Shanghai':106, 'Reno':107, 'Helsinki':108, 'Lima':109, 'Houston':110,
       'Malmo':111, 'Bend':112, 'Dublin':113, 'Selangor':114, 'Manama':115, 'Karachi':116,
       'Bucharest':117, 'Istanbul':118, 'Cincinnati':119, 'Moscow':120, 'Phoenix':121,
       'Davenport':122, 'Calgary':123, 'Nashville':124, 'Montevideo':125, 'Edinburgh':126,
       'Fayetteville':127, 'Madison':128, 'Sofia':129, 'Santa Fe':130, 'Cork':131, 'Paris':132,
       'Ahmedabad':134, 'Joinville':135, 'Hanoi':136, 'Norwalk':137, 'Zurich':138, 'Tallin':139,
       'Dusseldorf':140, 'Lisbon':141, 'Cape Town':142, 'Missoula':143, 'Guadalajara':144,
       'Blumenau':145, 'Milwaukee':146, 'Ann Arbor':147, 'Quebec':148, 'Prague':149, 'Yangon':150,
       'New Orleans':151, 'Sacramento':152, 'Charlotte':153, 'Tampa Bay':154})


# unique must be called; the bare attribute only shows the bound-method repr.
# The result lists the codes (or unmapped leftovers) that remain.
df2["location"].unique()

In [None]:
df2

Training & Testing

In [None]:
# Features: every column of the encoded frame df2 except the target. 'date' is
# also removed if it is still present, because it is a raw string the tree
# models cannot consume. (X used to be built from `df`, which still holds the
# un-encoded text columns.)
X = df2.drop(columns=["country", "date"], errors="ignore")
# Target: the encoded country code. The old positional iloc[:, -1] picked the
# last column of df2, which is 'day' (year/month/day were appended last).
y = df2["country"]
print(X.shape)
print(y.shape)

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Fix the seed so the fitted tree, and the accuracy reported below, are the
# same on every run.
dt = DecisionTreeClassifier(random_state=100)
dt.fit(X_train, y_train)

In [None]:
y_pred = dt.predict(X_test)
y_pred

In [None]:
# Predict the class for one hand-written sample.
# NOTE(review): this literal has six values; predict needs one value per column
# of X, in X's column order — confirm it matches the training features.
y_pred_query= dt.predict([[1,2,1000.0,0.10,1300.0,2]])
y_pred_query

In [None]:
y_test

Decision Tree 

In [None]:
# Draw the fitted decision tree on a wide canvas so the nodes stay legible.
plt.figure(figsize = (25,10))
plot_tree(dt)
plt.title("Layoff Decision Tree", fontsize = 25)

Decision Tree Accuracy

In [None]:
print("Decision Tree accuracy is: {:.2f}".format(accuracy_score(y_test, y_pred)*100))

Bagging

In [None]:
# Bagging ensemble of 100 estimators; the seed is fixed so the bootstrap
# samples, and the accuracy reported below, are the same on every run.
bag = BaggingClassifier(n_estimators=100, random_state=100)
bag.fit(X_train, y_train)

In [None]:
y_pred_bag = bag.predict(X_test)
y_pred_bag

Bagging Accuracy

In [None]:
print("Bagging accuracy score:{:.2f}".format(accuracy_score(y_test, y_pred_bag)*100))

Random Forest Classifier

In [None]:
# Random forest with default settings; the seed is fixed so the fitted forest,
# and the accuracy reported below, are the same on every run.
rf = RandomForestClassifier(random_state=100)
rf.fit(X_train, y_train)

In [None]:
y_pred_rf = rf.predict(X_test)
y_pred_rf

In [None]:
# Random Forest Accuracy
# (This heading sits in a code cell; as bare text it was a SyntaxError, so it is now a comment.)

In [None]:
# Accuracy of the random forest on the held-out rows, as a percentage.
# The predictions are stored in y_pred_rf; y_pred_rand was never defined.
print("Random Forest accuracy is: {:.2f}". format(accuracy_score(y_test, y_pred_rf)*100))

Overall Accuracy Scores

In [None]:
# Accuracy of the three models side by side, each as a percentage.
print("Decision Tree accuracy is: {:.2f}".format(accuracy_score(y_test, y_pred)*100))
print("Bagging accuracy score:{:.2f}".format(accuracy_score(y_test, y_pred_bag)*100))
# The random-forest predictions are stored in y_pred_rf; y_pred_rand was never defined.
print("Random Forest accuracy is: {:.2f}". format(accuracy_score(y_test, y_pred_rf)*100))

# Autovisualization 

Let's do an autovisualization of all the data

In [None]:
# AutoViz is a third-party package that is only needed for this final section.
from autoviz.AutoViz_Class import AutoViz_Class

AV = AutoViz_Class()

# Same CSV that was loaded at the top of the notebook; AutoViz reads it from disk itself.
filename = '../data/layoffs.csv'
# Generate the automatic charts with total_laid_off as the dependent variable.
AV.AutoViz(filename, depVar='total_laid_off', sep=",", dfte=None, chart_format='svg', max_rows_analyzed=160000, max_cols_analyzed=30)