# Joins in `pyspark`

Performed with `df_left.join(df_right, how=type_str)`

In [1]:
from pyspark.sql import SparkSession
from more_pyspark import to_pandas
spark = SparkSession.builder.appName('Ops').getOrCreate()
deptk = spark.read.csv("./data/department.csv",  header=True, inferSchema=True)
deptk.collect() >> to_pandas

Unnamed: 0,DeptID,DeptName
0,31,Sales
1,33,Engineering
2,34,Clerical
3,35,Marketing


In [2]:
emplk = spark.read.csv("./data/employee.csv",  header=True, inferSchema=True)
emplk.collect() >> to_pandas

Unnamed: 0,DeptID,LastName
0,31.0,Rafferty
1,33.0,Jones
2,33.0,Heisenberg
3,34.0,Robinson
4,34.0,Smith
5,,Williams


#### Inner join

In [3]:
(emplk.join(deptk, emplk.DeptID == deptk.DeptID, how='inner')
 .collect()) >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,31,Rafferty,Sales
1,33,Jones,Engineering
2,33,Heisenberg,Engineering
3,34,Robinson,Clerical
4,34,Smith,Clerical


#### Left join

In [4]:
(emplk.join(deptk, emplk.DeptID == deptk.DeptID, how='left')
 .collect()) >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,31.0,Rafferty,Sales
1,33.0,Jones,Engineering
2,33.0,Heisenberg,Engineering
3,34.0,Robinson,Clerical
4,34.0,Smith,Clerical
5,,Williams,


#### Right join

In [5]:
(emplk.join(deptk, emplk.DeptID == deptk.DeptID, how='right')
 .collect()) >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,31,Rafferty,Sales
1,33,Heisenberg,Engineering
2,33,Jones,Engineering
3,34,Smith,Clerical
4,34,Robinson,Clerical
5,35,,Marketing


#### Outer join

In [6]:
(emplk.join(deptk, emplk.DeptID == deptk.DeptID, how='outer')
 .collect()) >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,,Williams,
1,35.0,,Marketing
2,34.0,Robinson,Clerical
3,34.0,Smith,Clerical
4,31.0,Rafferty,Sales
5,33.0,Jones,Engineering
6,33.0,Heisenberg,Engineering


## <font color="red"> Exercise 2 </font>

Determine all the players that have hit more than 100 home runs in a season.  The final table should include the players proper name, as well as the team name.  

**Hint:** You will need join the files listed below.  To get credit for this exercise, use the join `pyspark` join methods presented above.

In [8]:
f1, f2, f3 = ("./data/baseball/core/Batting.csv", 
              "./data/baseball/core/People.csv",
              "./data/baseball/core/Teams.csv")

In [28]:
# Your code here
f1k = spark.read.csv(f1,header=True, inferSchema=True)
f1k.columns

['playerID',
 'yearID',
 'stint',
 'teamID',
 'lgID',
 'G',
 'AB',
 'R',
 'H',
 '2B',
 '3B',
 'HR',
 'RBI',
 'SB',
 'CS',
 'BB',
 'SO',
 'IBB',
 'HBP',
 'SH',
 'SF',
 'GIDP']

In [85]:
from pyspark.sql import functions as func
f1k_all = (f1k
     .groupBy(f1k["yearID"],f1k["playerID"])
 .agg(func.sum(f1k["HR"])).withColumnRenamed('sum(HR)', 'sum_HR'))
f1k_50 = f1k_all.filter(f1k_all["sum_HR"]>50)
f1k_50.show()
 

+------+---------+------+
|yearID| playerID|sum_HR|
+------+---------+------+
|  1955| mayswi01|    51|
|  1990|fieldce01|    51|
|  1999| sosasa01|    63|
|  1961|marisro01|    61|
|  2007|rodrial01|    54|
|  1930|wilsoha01|    56|
|  1977|fostege01|    52|
|  1961|mantlmi01|    54|
|  1996|mcgwima01|    52|
|  1947|kinerra01|    51|
|  1998|griffke02|    56|
|  2002|rodrial01|    57|
|  2001|gonzalu01|    57|
|  2006|ortizda01|    54|
|  1997|mcgwima01|    58|
|  1920| ruthba01|    54|
|  1965| mayswi01|    52|
|  1998|mcgwima01|    70|
|  1921| ruthba01|    59|
|  1932| foxxji01|    58|
+------+---------+------+
only showing top 20 rows



In [57]:
f2k = spark.read.csv(f2,header=True, inferSchema=True)
f2k.take(5) >> to_pandas()

Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,...,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
0,aardsda01,1981,12,27,USA,CO,Denver,,,,...,Aardsma,David Allan,215,75,R,R,2004-04-06,2015-08-23,aardd001,aardsda01
1,aaronha01,1934,2,5,USA,AL,Mobile,,,,...,Aaron,Henry Louis,180,72,R,R,1954-04-13,1976-10-03,aaroh101,aaronha01
2,aaronto01,1939,8,5,USA,AL,Mobile,1984.0,8.0,16.0,...,Aaron,Tommie Lee,190,75,R,R,1962-04-10,1971-09-26,aarot101,aaronto01
3,aasedo01,1954,9,8,USA,CA,Orange,,,,...,Aase,Donald William,190,75,R,R,1977-07-26,1990-10-03,aased001,aasedo01
4,abadan01,1972,8,25,USA,FL,Palm Beach,,,,...,Abad,Fausto Andres,184,73,L,L,2001-09-10,2006-04-13,abada001,abadan01


In [84]:
top_50_name = f1k_50.join(f2k, f2k.playerID == f1k_50.playerID, how='inner').select(["yearID","nameFirst","nameLast"])
top_50_name.take(10)>>to_pandas()

Unnamed: 0,yearID,nameFirst,nameLast
0,1955,Willie,Mays
1,1990,Cecil,Fielder
2,1999,Sammy,Sosa
3,1961,Roger,Maris
4,2007,Alex,Rodriguez
5,1930,Hack,Wilson
6,1977,George,Foster
7,1961,Mickey,Mantle
8,1996,Mark,McGwire
9,1947,Ralph,Kiner


In [58]:
f3k = spark.read.csv(f3,header=True, inferSchema=True)
f3k.take(5) >> to_pandas()

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,...,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro
0,1871,,BS1,BNA,,3,31,,20,10,...,24,0.834,Boston Red Stockings,South End Grounds I,,103,98,BOS,BS1,BS1
1,1871,,CH1,CNA,,2,28,,19,9,...,16,0.829,Chicago White Stockings,Union Base-Ball Grounds,,104,102,CHI,CH1,CH1
2,1871,,CL1,CFC,,8,29,,10,19,...,15,0.818,Cleveland Forest Citys,National Association Grounds,,96,100,CLE,CL1,CL1
3,1871,,FW1,KEK,,7,19,,7,12,...,8,0.803,Fort Wayne Kekiongas,Hamilton Field,,101,107,KEK,FW1,FW1
4,1871,,NY2,NNA,,5,33,,16,17,...,14,0.84,New York Mutuals,Union Grounds (Brooklyn),,90,88,NYU,NY2,NY2


In [82]:
(f3k.select(["yearID","name"])
.join(top_50_name, f3k.yearID == top_50_name.yearID, how='inner')
 .select(["nameFirst","nameLast","name"])
 .withColumnRenamed('name', 'TeamName')
 .take(50) >> to_pandas)

Unnamed: 0,nameFirst,nameLast,TeamName
0,Willie,Mays,Washington Senators
1,Willie,Mays,St. Louis Cardinals
2,Willie,Mays,Pittsburgh Pirates
3,Willie,Mays,Philadelphia Phillies
4,Willie,Mays,New York Yankees
5,Willie,Mays,New York Giants
6,Willie,Mays,Milwaukee Braves
7,Willie,Mays,Kansas City Athletics
8,Willie,Mays,Detroit Tigers
9,Willie,Mays,Cleveland Indians


## Up Next

Stuff