# Project 1 - Starter Notebook


In [0]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
 
# Obtain the Spark session for this project via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName("my_project_1")
    .getOrCreate()
)

Importing all spark data types and spark functions for your convenience.

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
# Read a CSV into a dataframe
# There is a smarter version, that will first check if there is a Parquet file and use it
def load_csv_file(filename, schema):
    """Load one of the known headerless CSV datasets into a DataFrame.

    Args:
        filename: Dataset key; 'Daily program data' or 'demographic'.
        schema: Schema applied to the file when it is read.

    Returns:
        The loaded DataFrame, or None (after printing a message) when
        ``filename`` is not one of the known dataset keys.
    """
    # Dataset key -> (name under the data folder, field delimiter)
    allowed_files = {
        'Daily program data': ('Daily program data', "|"),
        'demographic': ('demographic', "|"),
    }

    source = allowed_files.get(filename)
    if source is None:
        print(f'You were trying to access unknown file \"{filename}\". Only valid options are {allowed_files.keys()}')
        return None

    filepath, delimiter = source
    dataPath = f"dbfs:/mnt/coursedata2024/fwm-stb-data/{filepath}"

    reader = spark.read.format("csv")
    reader = reader.option("header", "false").option("delimiter", delimiter)
    return reader.schema(schema).load(dataPath)

def _make_schema(*columns):
    """Build a StructType from (column name, Spark data type) pairs."""
    return StructType([StructField(name, dtype) for name, dtype in columns])


# Schemata used when loading the data files, keyed by dataset name
schemas_dict = {
    'Daily program data': _make_schema(
        ('prog_code', StringType()),
        ('title', StringType()),
        ('genre', StringType()),
        ('air_date', StringType()),
        ('air_time', StringType()),
        ('Duration', FloatType()),
    ),
    'viewing': _make_schema(
        ('device_id', StringType()),
        ('event_date', StringType()),
        ('event_time', IntegerType()),
        ('mso_code', StringType()),
        ('prog_code', StringType()),
        ('station_num', StringType()),
    ),
    'viewing_full': _make_schema(
        ('mso_code', StringType()),
        ('device_id', StringType()),
        ('event_date', IntegerType()),
        ('event_time', IntegerType()),
        ('station_num', StringType()),
        ('prog_code', StringType()),
    ),
    'demographic': _make_schema(
        ('household_id', StringType()),
        ('household_size', IntegerType()),
        ('num_adults', IntegerType()),
        ('num_generations', IntegerType()),
        ('adult_range', StringType()),
        ('marital_status', StringType()),
        ('race_code', StringType()),
        ('presence_children', StringType()),
        ('num_children', IntegerType()),
        ('age_children', StringType()),  # format like range - 'bitwise'
        ('age_range_children', StringType()),
        ('dwelling_type', StringType()),
        ('home_owner_status', StringType()),
        ('length_residence', IntegerType()),
        ('home_market_value', StringType()),
        ('num_vehicles', IntegerType()),
        ('vehicle_make', StringType()),
        ('vehicle_model', StringType()),
        ('vehicle_year', IntegerType()),
        ('net_worth', IntegerType()),
        ('income', StringType()),
        ('gender_individual', StringType()),
        ('age_individual', IntegerType()),
        ('education_highest', StringType()),
        ('occupation_highest', StringType()),
        ('education_1', StringType()),
        ('occupation_1', StringType()),
        ('age_2', IntegerType()),
        ('education_2', StringType()),
        ('occupation_2', StringType()),
        ('age_3', IntegerType()),
        ('education_3', StringType()),
        ('occupation_3', StringType()),
        ('age_4', IntegerType()),
        ('education_4', StringType()),
        ('occupation_4', StringType()),
        ('age_5', IntegerType()),
        ('education_5', StringType()),
        ('occupation_5', StringType()),
        ('polit_party_regist', StringType()),
        ('polit_party_input', StringType()),
        ('household_clusters', StringType()),
        ('insurance_groups', StringType()),
        ('financial_groups', StringType()),
        ('green_living', StringType()),
    ),
}

# Read demographic data


In [0]:
%%time
# Load the demographic table; its dataset key is 'demographic'
demo_df = load_csv_file('demographic', schemas_dict['demographic'])
demo_df.printSchema()
# count() is an action that scans the whole file, so it is run once only,
# here where its result is actually reported.
print(f'demo_df contains {demo_df.count()} records!')
display(demo_df.limit(6))

root
 |-- household_id: string (nullable = true)
 |-- household_size: integer (nullable = true)
 |-- num_adults: integer (nullable = true)
 |-- num_generations: integer (nullable = true)
 |-- adult_range: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- race_code: string (nullable = true)
 |-- presence_children: string (nullable = true)
 |-- num_children: integer (nullable = true)
 |-- age_children: string (nullable = true)
 |-- age_range_children: string (nullable = true)
 |-- dwelling_type: string (nullable = true)
 |-- home_owner_status: string (nullable = true)
 |-- length_residence: integer (nullable = true)
 |-- home_market_value: string (nullable = true)
 |-- num_vehicles: integer (nullable = true)
 |-- vehicle_make: string (nullable = true)
 |-- vehicle_model: string (nullable = true)
 |-- vehicle_year: integer (nullable = true)
 |-- net_worth: integer (nullable = true)
 |-- income: string (nullable = true)
 |-- gender_individual: string (nullable = t

household_id,household_size,num_adults,num_generations,adult_range,marital_status,race_code,presence_children,num_children,age_children,age_range_children,dwelling_type,home_owner_status,length_residence,home_market_value,num_vehicles,vehicle_make,vehicle_model,vehicle_year,net_worth,income,gender_individual,age_individual,education_highest,occupation_highest,education_1,occupation_1,age_2,education_2,occupation_2,age_3,education_3,occupation_3,age_4,education_4,occupation_4,age_5,education_5,occupation_5,polit_party_regist,polit_party_input,household_clusters,insurance_groups,financial_groups,green_living
15,2.0,2.0,1.0,100000000,S,B,,,0,0,S,O,5.0,E,,,,,6.0,4.0,M,60.0,4.0,,,,,,,,,,,,,,,,,D,443,02C3,08C3,
24,2.0,2.0,1.0,100000000000,,W,,,0,0,M,O,,F,,,,,7.0,7.0,F,46.0,3.0,Z,,,,,,,,,,,,,,,,R,223,09O3,03O3,
26,,,,0,,,,,0,0,S,,,F,,,,,,,,,,,,,,,,,,,,,,,,,,,46G,04CG,08CG,
28,3.0,2.0,2.0,110000000000000,S,W,Y,1.0,10000000000000,1000000000,S,O,3.0,H,,,,,5.0,7.0,M,38.0,2.0,4,,,34.0,1.0,7.0,,,,,,,,,,,V,473,11R3,09C3,1.0
35,1.0,1.0,1.0,100000000000,,W,,,0,0,,,,G,,,,,4.0,,M,50.0,2.0,1,,,,,,,,,,,,,,,,D,523,13C3,08C3,
36,,,,0,,,,,0,0,,,,G,,,,,,,,,,,,,,,,,,,,,,,,,,,51G,10RG,10RG,


CPU times: user 12.5 ms, sys: 877 µs, total: 13.4 ms
Wall time: 2.95 s


# Read Daily program data

In [0]:
%%time
# The daily program table is stored under the dataset key 'Daily program data'
daily_prog_key = 'Daily program data'
daily_prog_df = load_csv_file(daily_prog_key, schemas_dict[daily_prog_key])

daily_prog_df.printSchema()
daily_prog_rows = daily_prog_df.count()
print(f'daily_prog_df contains {daily_prog_rows} records!')
display(daily_prog_df.limit(6))

root
 |-- prog_code: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- air_date: string (nullable = true)
 |-- air_time: string (nullable = true)
 |-- Duration: float (nullable = true)

daily_prog_df contains 13194849 records!


prog_code,title,genre,air_date,air_time,Duration
EP000000250035,21 Jump Street,Crime drama,20151219,50000,60.0
EP000000250035,21 Jump Street,Crime drama,20151219,110000,60.0
EP000000250063,21 Jump Street,Crime drama,20151219,180000,60.0
EP000000510007,A Different World,Sitcom,20151219,100000,30.0
EP000000510008,A Different World,Sitcom,20151219,103000,30.0
EP000000510159,A Different World,Sitcom,20151219,80300,29.0


CPU times: user 129 ms, sys: 9.51 ms, total: 138 ms
Wall time: 8.99 s


# Read viewing data

In [0]:
# The viewing data is a comma-delimited CSV with a header row, read with the
# 'viewing_full' schema.
dataPath = "dbfs:/FileStore/ddm/10m_viewing"

viewing_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", ",")
    .schema(schemas_dict['viewing_full'])
)
viewing10m_df = viewing_reader.load(dataPath)

display(viewing10m_df.limit(6))
viewing_rows = viewing10m_df.count()
print(f'viewing10m_df contains {viewing_rows} rows!')

mso_code,device_id,event_date,event_time,station_num,prog_code
1540,0000000050f3,20150222,193802,61812,EP009279780033
1540,0000000050f3,20150222,195314,31709,EP021056430002
1540,0000000050f3,20150222,200151,61812,EP009279780033
1540,000000005518,20150222,111139,46784,EP004891370013
1540,000000005518,20150222,190000,14771,EP012124070127
1540,000000005518,20150222,200000,14771,EP010237320166


viewing10m_df contains 9935852 rows!


# Read reference data

Note that we removed the 'System Type' column.

In [0]:
# Columns expected in the reference data. This schema is NOT passed to the
# reader below: Parquet files store their own schema, and that stored schema
# is what Spark uses. It is kept here as documentation of the expected layout.
ref_data_schema = StructType([
    StructField('device_id', StringType()),
    StructField('dma', StringType()),
    StructField('dma_code', StringType()),
    StructField('household_id', IntegerType()),
    StructField('zipcode', IntegerType())
])

# Read the reference data as Parquet. No "inferSchema" option is set because
# it has no effect on the Parquet source.
dataPath = "dbfs:/FileStore/ddm/ref_data"
ref_data = spark.read.format('parquet').load(dataPath)

display(ref_data.limit(6))
print(f'ref_data contains {ref_data.count()} rows!')

device_id,dma,dma_code,household_id,zipcode
0000000050f3,Toledo,547,1471346,43609
000000006785,Amarillo,634,1924512,79119
000000007320,Lake Charles,643,3154808,70634
000000007df9,Lake Charles,643,1924566,70601
000000009595,Lexington,541,1600886,40601
000000009c6a,Houston,618,1924713,77339


ref_data contains 704172 rows!


# Part 2

## 2.1 Identifying Slots

### Query 1 -
Display the top 5 most popular genres, by the amount of people who viewed it (’household size’) and print the total number of people who viewed those genres.

In [0]:
# Query 1 inputs: keep only the columns this query needs from each table,
# drop rows with a missing value in those columns, and remove duplicates.

# Reference data (ref_data): device -> household, with household_id as integer
q_one_ref_data = (
    ref_data
    .select('device_id', col('household_id').cast(IntegerType()).alias('household_id'))
    .dropna(subset=['device_id', 'household_id'])
    .distinct()
)

# Daily program data (daily_prog_df): one row per (program, genre); the genre
# column is split on commas and exploded into separate rows
q_one_daily_prog_df = (
    daily_prog_df
    .select('prog_code', explode(split('genre', ',')).alias('genre'))
    .dropna(subset=['genre', 'prog_code'])
    .distinct()
)

# Program viewing data (viewing10m_df): which device watched which program
q_one_viewing10m_df = (
    viewing10m_df
    .select('device_id', 'prog_code')
    .dropna(subset=['device_id', 'prog_code'])
    .distinct()
)

# Demographic data (demo_df): household -> household size
q_one_demo_df = (
    demo_df
    .select(col('household_id').cast(IntegerType()).alias('household_id'), 'household_size')
    .dropna(subset=['household_id', 'household_size'])
    .distinct()
)

In [0]:
# Query 1: top 5 genres by number of people (household_size) who viewed them.

# genre -> device that watched a program of that genre
genres_with_devices = q_one_daily_prog_df.join(q_one_viewing10m_df, 'prog_code', 'inner').drop('prog_code')

# genre -> household owning that device
geners_with_household_id = genres_with_devices.join(q_one_ref_data, 'device_id', 'inner').drop('device_id')

# distinct() keeps one row per (household, genre, household_size), so a
# household is counted once per genre regardless of how many devices or
# programs produced the match.
households_per_genre = geners_with_household_id.join(q_one_demo_df, 'household_id', 'inner').distinct()

# Total people per genre, five largest first
genres_with_household_size = households_per_genre.groupBy('genre').sum('household_size').\
    orderBy(col("sum(household_size)").desc()).limit(5)

display(genres_with_household_size.select('genre', 'sum(household_size)'))

sum_of_households_row = genres_with_household_size.agg(sum("sum(household_size)").alias("total")).first()
# The aggregate is null when no rows matched; report 0 instead of failing on int(None)
sum_of_households = sum_of_households_row["total"] or 0
print(f'Query 1: The total number of people who viewed the top 5 genres is {int(sum_of_households):,d}')

genre,sum(household_size)
News,615305
Reality,610476
Talk,537723
Comedy,509672
Sitcom,502753


Query 1: The total number of people in the top 5 DMAs is 2,775,929


### Query 2 -
Display the top 5 most popular DMAs (by amount of devices) and print the total number of people who reside in households in those DMAs.

In [0]:
# Query 2 inputs: keep only the columns this query needs from each table.

# Reference data (ref_data): distinct device/household/DMA rows with
# household_id as integer, then only rows where all four columns are present
q_two_ref_data = (
    ref_data
    .select(
        'device_id',
        col('household_id').cast(IntegerType()).alias('household_id'),
        'dma',
        'dma_code',
    )
    .distinct()
)
q_two_has_all_keys = col('device_id').isNotNull()
for q_two_key in ('household_id', 'dma', 'dma_code'):
    q_two_has_all_keys = q_two_has_all_keys & col(q_two_key).isNotNull()
q_two_ref_data = q_two_ref_data.filter(q_two_has_all_keys)

# Demographic data (demo_df): household -> household size
q_two_demo_df = (
    demo_df
    .select(col('household_id').cast(IntegerType()).alias('household_id'), 'household_size')
    .dropna(subset=['household_id', 'household_size'])
    .distinct()
)

In [0]:
# Query 2: the five DMA codes with the most device rows in the reference data
devices_per_dma = q_two_ref_data.groupBy('dma_code').agg(count('*').alias('devices'))
top_dma_codes = devices_per_dma.orderBy(col('devices').desc()).limit(5)

# Households located in those five DMAs (joined back to get 'household_id')
top_5_popular_dmas = (
    top_dma_codes
    .join(q_two_ref_data, 'dma_code', 'inner')
    .select('household_id', 'dma', 'dma_code')
    .distinct()
)

# Attach the household size to each of those households
q_two = (
    top_5_popular_dmas
    .join(q_two_demo_df, 'household_id', 'inner')
    .select('household_id', 'dma', 'dma_code', 'household_size')
    .distinct()
)

# People per DMA code
dma_codes_with_household_size = (
    q_two.groupBy('dma_code')
    .sum('household_size')
    .select('dma_code', 'sum(household_size)')
)

# Add the DMA name next to each code for display
q_two_ref_data_only_dma = q_two_ref_data.select('dma_code', 'dma').distinct()
dma_codes_with_household_size_and_name = dma_codes_with_household_size.join(
    q_two_ref_data_only_dma, 'dma_code', 'inner'
)

display(dma_codes_with_household_size_and_name)

sum_of_households_row = (
    dma_codes_with_household_size_and_name
    .agg(sum("sum(household_size)").alias("total"))
    .first()
)
sum_of_households = sum_of_households_row["total"]
print(f'Query 2: The total number of people in the top 5 DMAs is {int(sum_of_households):,d}')

dma_code,sum(household_size),dma
577,42844,Wilkes Barre-Scranton-Hztn
819,35124,Seattle-Tacoma
547,24108,Toledo
564,60656,Charleston-Huntington
693,31652,Little Rock-Pine Bluff


Query 2: The total number of people in the top 5 DMAs is 194,384


### Query 3 -
Display the top 5 most popular programs (’prog title’), by the number of people who viewed them and live in households with children present (’presence children’), and print the total number of people who viewed them.

In [0]:
# Query 3 inputs: keep only the columns this query needs from each table,
# drop rows with a missing value in the key columns, and remove duplicates.

# Reference data (ref_data): device -> household, with household_id as integer
q_three_ref_data = (
    ref_data
    .select('device_id', col('household_id').cast(IntegerType()).alias('household_id'))
    .dropna(subset=['household_id', 'device_id'])
    .distinct()
)

# Daily program data (daily_prog_df): program code -> title
q_three_daily_prog_df = (
    daily_prog_df
    .select('prog_code', 'title')
    .dropna(subset=['title', 'prog_code'])
    .distinct()
)

# Program viewing data (viewing10m_df): which device watched which program
q_three_viewing10m_df = (
    viewing10m_df
    .select('device_id', 'prog_code')
    .dropna(subset=['prog_code', 'device_id'])
    .distinct()
)

# Demographic data (demo_df): household size and the children-present flag
q_three_demo_df = demo_df.select(
    col('household_id').cast(IntegerType()).alias('household_id'),
    'household_size',
    'presence_children',
)
# Every household with a known id and size
q_three_demo_df_all = q_three_demo_df.dropna(subset=['household_id', 'household_size']).distinct()
# Only the households flagged 'Y' for presence of children
q_three_demo_df_household_with_kids = q_three_demo_df_all.filter(col('presence_children') == 'Y')

In [0]:
# Query 3: top 5 program titles ranked by viewers living in households with
# children present.

# title -> device that watched it
title_with_views = q_three_daily_prog_df.join(q_three_viewing10m_df, 'prog_code', 'inner').drop('prog_code').distinct()

# title -> household owning that device (one row per title/household pair)
prog_code_with_household_id = title_with_views.join(q_three_ref_data, 'device_id', 'inner').drop('device_id').distinct()

# Rank titles by the summed size of viewing households that have children
top_5_with_household_kids = prog_code_with_household_id.join(q_three_demo_df_household_with_kids, 'household_id', 'inner').\
    groupBy('title').sum('household_size').select('title', 'sum(household_size)').\
    orderBy(col("sum(household_size)").desc()).limit(5)

# Restrict the title/household pairs to the five selected titles
top_5_plus_household_id = prog_code_with_household_id.join(top_5_with_household_kids , 'title' , 'inner')

# The totals reported below are summed over ALL viewing households of those
# titles (q_three_demo_df_all), not only the households with children.
prog_code_with_household_size = top_5_plus_household_id.join(q_three_demo_df_all, 'household_id', 'inner').\
    groupBy('title').sum('household_size').select('title', 'sum(household_size)')

display(prog_code_with_household_size.orderBy(col("sum(household_size)").desc()))

sum_of_households_row = prog_code_with_household_size.agg(sum("sum(household_size)").alias("total")).first()
# The aggregate is null when no rows matched; report 0 instead of failing on int(None)
sum_of_households = sum_of_households_row["total"] or 0
print(f'Query 3: The total number of people who viewed the top 5 programs is {int(sum_of_households):,d}')

title,sum(household_size)
College Basketball,155682
Paid Programming,147968
SportsCenter,111622
The Big Bang Theory,104211
Today,92618


Query 3: The total number of people in the top 5 DMAs is 612,101


## 2.2 Money and Corruption

In [0]:
# Wealth score per DMA:
#   average net worth / max net worth  +  average income / max income
# household_id is cast to integer in both tables so the join keys have the same type.
from_ref_data = ref_data.withColumn("household_id", col("household_id").cast(IntegerType()))
from_demo_df = demo_df.withColumn("household_id", col("household_id").cast(IntegerType()))

# Income codes: the letters A-D map to 10-13; any other value is cast to an
# integer, which yields null for non-numeric text.
income_code = col("income")
numeric_income = (
    when(income_code == "A", 10)
    .when(income_code == "B", 11)
    .when(income_code == "C", 12)
    .when(income_code == "D", 13)
    .otherwise(income_code.cast("int"))
)

# household -> numeric income, without rows whose income could not be converted
from_demo_df_income = (
    from_demo_df.select("household_id", "income")
    .dropna()
    .select("household_id", numeric_income.alias("income"))
    .filter(col("income").isNotNull())
)

# household -> net worth
from_demo_df_net_worth = from_demo_df.select("household_id", "net_worth").dropna()

# Largest observed values, used to normalise the two averages
max_income = from_demo_df_income.agg(max("income")).first()[0]
max_net_worth = from_demo_df_net_worth.agg(max("net_worth")).first()[0]

# Average income and average net worth per DMA, over the reference rows joined on household_id
from_demo_df_income_averege = (
    from_ref_data.join(from_demo_df_income, "household_id", "inner")
    .groupBy("dma")
    .agg(avg("income").alias("averege_income"))
)
from_demo_df_net_worth_averege = (
    from_ref_data.join(from_demo_df_net_worth, "household_id", "inner")
    .groupBy("dma")
    .agg(avg("net_worth").alias("averege_net_worth"))
)

# Combine both normalised averages and keep the ten highest-scoring DMAs
wealth_score_expr = col("averege_net_worth") / max_net_worth + col("averege_income") / max_income
dma_wealth_score = (
    from_demo_df_income_averege.join(from_demo_df_net_worth_averege, "dma", "inner")
    .select("dma", wealth_score_expr.alias("wealth_score"))
    .orderBy(col("wealth_score").desc())
    .limit(10)
)

display(dma_wealth_score)

dma,wealth_score
San Antonio,1.623931623931624
San Francisco-Oak-San Jose,1.5112336743183772
Baltimore,1.497726825873881
Sacramnto-Stkton-Modesto,1.437817422163405
"Bend, OR",1.4293804557368408
Austin,1.410291286357595
Houston,1.4027755123331405
Seattle-Tacoma,1.3907189319785074
Miami-Ft. Lauderdale,1.3641203886910258
Detroit,1.3476548506981374


In [0]:
# Every distinct genre in the daily program data; the genre column is split
# on commas and exploded so each genre becomes its own row
genres_df = daily_prog_df.select(explode(split('genre', ',')).alias('genre')).distinct()

# Show how many distinct genres there are
display(genres_df.count())

# Collect the genres to the driver as a Python set
genres = {row['genre'] for row in genres_df.collect()}

199

In [0]:
# Viewing events: distinct (device, program) pairs with both values present
views = viewing10m_df.select('device_id','prog_code')\
    .filter(col('device_id').isNotNull() & col('prog_code').isNotNull()).distinct()

# Airings: distinct (program, genre) pairs
airings = daily_prog_df.select('prog_code','genre').distinct()

# Reference rows: distinct (device, DMA code, DMA name).
# The null filter runs BEFORE the projection because it tests household_id,
# which is not among the selected columns.
ref = ref_data\
    .filter(col('device_id').isNotNull() & col('household_id').isNotNull() & col('dma_code').isNotNull())\
    .select('device_id','dma_code','dma').distinct()

In [0]:
# Viewing rows enriched with genre and DMA; the inner join with
# dma_wealth_score keeps only devices located in the top 10 DMAs
views_with_genres = views.join(airings, 'prog_code', 'inner')
device_dmas = ref.select('device_id', 'dma')
top_10_with_genres = (
    views_with_genres
    .join(device_dmas, 'device_id', 'inner')
    .join(dma_wealth_score, 'dma', 'inner')
    .distinct()
)

# Result table: one row per DMA, starting with an empty genre list
final_df = dma_wealth_score.select(
    col("dma").alias("DMA_NAME"),
    col("wealth_score").alias("WEALTH_SCORE"),
    array().alias("ORDERED_LIST_OF_GENRES"),
)

# DMAs still waiting to be processed
process_df = dma_wealth_score

# Print the schema of each of the three DataFrames
for schema_label, schema_frame in (
    ("top_10_with_genres", top_10_with_genres),
    ("final_df", final_df),
    ("process_df", process_df),
):
    print(f"'{schema_label}' schema:")
    schema_frame.printSchema()


'top_10_with_genres' schema:
root
 |-- dma: string (nullable = true)
 |-- device_id: string (nullable = true)
 |-- prog_code: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- wealth_score: double (nullable = true)

'final_df' schema:
root
 |-- DMA_NAME: string (nullable = true)
 |-- WEALTH_SCORE: double (nullable = true)
 |-- ORDERED_LIST_OF_GENRES: array (nullable = false)
 |    |-- element: void (containsNull = false)

'process_df' schema:
root
 |-- dma: string (nullable = true)
 |-- wealth_score: double (nullable = true)



In [0]:
# Hand out genres to the DMAs, wealthiest DMA first. Each DMA receives its
# most-watched genres (by number of distinct devices) among those not yet
# taken by a wealthier DMA.
# `genres` is the Python set of all genres still available (built in an
# earlier cell); it shrinks as genres are handed out.

GENRES_PER_DMA = 11

# DMA names in descending wealth order, fetched once. Works for any number of
# DMAs, including fewer than 10.
ordered_dmas = [row["dma"] for row in process_df.orderBy(col("wealth_score").desc()).collect()]

# One Spark job for all DMAs: distinct devices per (DMA, genre). The genre
# column is split on commas so each genre is counted separately, and
# dropDuplicates counts a device at most once per genre within a DMA.
genre_device_counts = (
    top_10_with_genres
    .select('dma', 'device_id', explode(split('genre', ',')).alias('genre'))
    .dropDuplicates(['dma', 'genre', 'device_id'])
    .groupBy('dma', 'genre')
    .count()
    .collect()
)

# DMA name -> list of (genre, device count)
genres_by_dma = {}
for row in genre_device_counts:
    genres_by_dma.setdefault(row["dma"], []).append((row["genre"], row["count"]))

for dma in ordered_dmas:
    # Rank on the driver right before truncating, so the top genres are
    # really the ones kept; ties are broken by genre name to stay deterministic.
    ranked_genres = sorted(genres_by_dma.get(dma, []), key=lambda pair: (-pair[1], pair[0]))

    # Skip genres already taken and keep the first GENRES_PER_DMA
    dma_available_genres_list = [g for g, _ in ranked_genres if g in genres][:GENRES_PER_DMA]
    genres -= set(dma_available_genres_list)

    print(f"DMA {dma} gets genres: {dma_available_genres_list}")
    print(f"Genres remaining: {len(genres)}")

    # Store this DMA's genres in its row of 'final_df'; other rows are left as they are
    final_df = final_df.withColumn("ORDERED_LIST_OF_GENRES", when(col("DMA_NAME") == dma, array(*[lit(g) for g in dma_available_genres_list])).\
        otherwise(col("ORDERED_LIST_OF_GENRES")))

display(final_df.select("DMA_NAME", "WEALTH_SCORE", "ORDERED_LIST_OF_GENRES"))

DMA San Antonio gets genres: []
Genres remaining: 199
DMA San Francisco-Oak-San Jose gets genres: ['Reality', 'News', 'Comedy', 'Music', 'Sitcom', 'Talk', 'Drama', 'Documentary', 'Adventure', 'Children', 'Action']
Genres remaining: 188
DMA Baltimore gets genres: []
Genres remaining: 188
DMA Sacramnto-Stkton-Modesto gets genres: ['Entertainment', 'Crime drama', 'Consumer', 'Animated', 'Newsmagazine', 'Suspense', 'Fantasy', 'Crime', 'Special', 'Mystery', 'Sports event']
Genres remaining: 177
DMA Bend, OR gets genres: ['Shopping', 'Sports non-event', 'Game show', 'House/garden', 'Educational', 'Law', 'Travel', 'Public affairs', 'Interview', 'Cooking', 'How-to']
Genres remaining: 166
DMA Austin gets genres: ['Home improvement', 'Science fiction', 'Politics', 'Basketball', 'Romance', 'Sports talk', 'History', 'Bus./financial', 'Horror', 'Medical', 'Science']
Genres remaining: 155
DMA Houston gets genres: ['Religious', 'Paranormal', 'Soap', 'Animals', 'Outdoors', 'Nature', 'Comedy-drama', 'R

DMA_NAME,WEALTH_SCORE,ORDERED_LIST_OF_GENRES
San Antonio,1.623931623931624,List()
San Francisco-Oak-San Jose,1.5112336743183772,"List(Reality, News, Comedy, Music, Sitcom, Talk, Drama, Documentary, Adventure, Children, Action)"
Baltimore,1.497726825873881,List()
Sacramnto-Stkton-Modesto,1.437817422163405,"List(Entertainment, Crime drama, Consumer, Animated, Newsmagazine, Suspense, Fantasy, Crime, Special, Mystery, Sports event)"
"Bend, OR",1.4293804557368408,"List(Shopping, Sports non-event, Game show, House/garden, Educational, Law, Travel, Public affairs, Interview, Cooking, How-to)"
Austin,1.410291286357595,"List(Home improvement, Science fiction, Politics, Basketball, Romance, Sports talk, History, Bus./financial, Horror, Medical, Science)"
Houston,1.4027755123331405,"List(Religious, Paranormal, Soap, Animals, Outdoors, Nature, Comedy-drama, Romance-comedy, Weather, Golf, Fashion)"
Seattle-Tacoma,1.3907189319785074,"List(Western, Health, Historical drama, Auto, Awards, War, Auto racing, Docudrama, Biography, Fishing, Community)"
Miami-Ft. Lauderdale,1.3641203886910258,List()
Detroit,1.3476548506981374,"List(Hockey, Variety, Football, Musical, Collectibles, Technology, Hunting, Baseball, Parenting, Auction, Anthology)"
