Skip to content

Kramlasor13/Data-Cleaning-Using-MySQL

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

4 Commits
 
 
 
 
 
 
 
 

Repository files navigation

Data-Cleaning-Using-MySQL

Data Layoffs Cleaning and Data Exploratory Project using MySQL

-- Data Cleaning

ALTER TABLE layoffs RENAME TO layoffs_table;

select * from layoffs_table;

-- 1. Remove Duplicates -- 2. Standardize the Data -- 3. Null Values or Blank Values -- 4. Remove any column that are unnecessary

create table layoffs_table2 like layoffs_table;

select * from layoffs_table2;

insert into layoffs_table2 select * from layoffs_table;

select *, row_number() over (partition by company,industry, total_laid_off, percentage_laid_off, date) as row_num from layoffs_table2;

with duplicate_cte as ( select *, row_number() over (partition by company,location, industry, total_laid_off, percentage_laid_off, date,stage, country, funds_raised_millions) as row_num from layoffs_table2 ) select * from duplicate_cte where row_num > 1;

select * from layoffs_table2 where company = 'Casper';

with duplicate_cte as ( select *, row_number() over (partition by company,location, industry, total_laid_off, percentage_laid_off, date,stage, country, funds_raised_millions) as row_num from layoffs_table2 ) delete from duplicate_cte where row_num > 1;

CREATE TABLE layoffs_table3 ( company text, location text, industry text, total_laid_off int DEFAULT NULL, percentage_laid_off text, date text, stage text, country text, funds_raised_millions int DEFAULT NULL, row_num int ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

select * from layoffs_table3;

insert into layoffs_table3 select *, row_number() over (partition by company,location, industry, total_laid_off, percentage_laid_off, date,stage, country, funds_raised_millions) as row_num from layoffs_table2;

select * from layoffs_table3 ;

delete from layoffs_table3 where row_num > 1 ;

-- Standardizing Data

select company, trim(company) from layoffs_table3 ;

update layoffs_table3 set company = trim(company);

select * from layoffs_table3 where industry like 'crypto%' ;

update layoffs_table3 set industry = 'Crypto' where industry like 'Crypto%';

select distinct industry from layoffs_table3;

select distinct location from layoffs_table3 order by 1;

select distinct country from layoffs_table3 order by 1;

select * from layoffs_table3 where country like 'United States%';

select distinct country, trim(trailing '.' from country) from layoffs_table3 order by 1;

update layoffs_table3 set country = trim(trailing '.' from country) where country like 'United States%';

select date, str_to_date(date, '%m/%d/%Y') from layoffs_table3;

update layoffs_table3 set date = str_to_date(date, '%m/%d/%Y');

select date from layoffs_table3;

alter table layoffs_table3 modify column date date;

select * from layoffs_table3;

select * from layoffs_table3 where total_laid_off is null and percentage_laid_off is null;

select * from layoffs_table3 where industry is null or industry= '';

select * from layoffs_table3 where company = 'Airbnb';

select in1.industry, in2.industry from layoffs_table3 in1 join layoffs_table3 in2 on in1.company=in2.company where (in1.industry is null or in1.industry = '') and in2.industry is not null;

update layoffs_table3 set industry = null where industry = '';

update layoffs_table3 in1 join layoffs_table3 in2 on in1.company=in2.company set in1.industry = in2.industry where (in1.industry is null) and in2.industry is not null;

select * from layoffs_table3 where industry is null or industry= '';

select * from layoffs_table3 where company like 'Bally%';

select * from layoffs_table3;

select * from layoffs_table3 where total_laid_off is null and percentage_laid_off is null;

delete from layoffs_table3 where total_laid_off is null and percentage_laid_off is null;

alter table layoffs_table3 drop column row_num;

-- exploratory data analysis

select * from layoffs_table3 order by company;

-- i select everything that in the table that has a 100% layoffs and it is ordered by from the highest total laid off to lowest.

select * from layoffs_table3 where percentage_laid_off = 1 order by total_laid_off desc;

-- getting the sum of total laid off of each company

select company, sum(total_laid_off) from layoffs_table3 group by company order by 2 desc;

-- selecting the year that laiding off starts and laiding off end

select min(date), max(date) from layoffs_table3;

-- getting the sum of total laid off of each industry

select industry, sum(total_laid_off) from layoffs_table3 group by industry order by 2 desc;

-- getting the sum of total laid off of each country

select country, sum(total_laid_off) from layoffs_table3 group by country order by 2 desc;

-- getting the sum of total laid off of each date

select date, sum(total_laid_off) from layoffs_table3 group by date order by 1 desc;

-- getting the sum of total laid off of each date (per year)

select year(date), sum(total_laid_off) from layoffs_table3 group by year(date) order by 1 desc;

-- getting the sum of total laid off of each stage

select stage, sum(total_laid_off) from layoffs_table3 group by stage order by 2 desc;

select * from layoffs_table3;

-- getting the sum of total laid off of each year and month without null values

select substring(date, 1,7) as Month, sum(total_laid_off) from layoffs_table3 where substring(date, 1,7) is not null group by Month order by 1 asc;

-- getting the sum of total laid off of each year and month with rolling sum to find out the detailed -- total number of people laid off each year. without null values

with rolling_total as ( select substring(date, 1,7) as Month, sum(total_laid_off) as total_laid from layoffs_table3 where substring(date, 1,7) is not null group by Month order by 1 asc ) select Month,total_laid ,sum(total_laid) over(order by Month) as ROLING_TOTAL from rolling_total;

select company, sum(total_laid_off) from layoffs_table3 group by company order by 2 desc;

-- getting the total laid off each company per year

select company, year(date) as year, sum(total_laid_off) from layoffs_table3 group by company,year order by year;

-- Ranking the companies with the highest total layoffs each year

with Company_year (company, years, total_laid_off) as ( select company, year(date) as year, sum(total_laid_off) from layoffs_table3 group by company,year ) select *, dense_rank() over (partition by years order by total_laid_off desc) as ranking from Company_year where years is not null order by ranking asc;

-- ranking the top 5 companies with the highest total layoffs each year.

with Company_year (company, years, total_laid_off) as ( select company, year(date) as year, sum(total_laid_off) from layoffs_table3 group by company,year ), company_year_rank as ( select *, dense_rank() over (partition by years order by total_laid_off desc) as ranking from Company_year where years is not null ) select * from company_year_rank where ranking <= 5;

-- Ranking the industry with the highest total layoffs each year

with Industry_year (industry, years, total_laid_off) as ( select industry, year(date) as year, sum(total_laid_off) from layoffs_table3 group by industry,year ) select *, dense_rank() over (partition by years order by total_laid_off desc) as ranking from Industry_year where years is not null order by ranking asc;

-- ranking the top 5 industry with the highest total layoffs each year.

with Industry_year (industry, years, total_laid_off) as ( select industry, year(date) as year, sum(total_laid_off) from layoffs_table3 group by industry,year ), industry_year_rank as ( select *, dense_rank() over (partition by years order by total_laid_off desc) as ranking from Industry_year where years is not null ) select * from industry_year_rank where ranking <= 5;

About

Data Layoffs Cleaning and Data Exploratory Project using MySQL

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published