# Star Wars Survey

# 1. Data extraction and cleaning

In [1]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

%matplotlib inline

import seaborn as sns

# Load the raw survey export; the file is decoded as ISO-8859-1 rather than the default UTF-8.
star_wars = pd.read_csv("StarWars.csv", encoding="ISO-8859-1")

In [2]:
# Overview of the raw frame: number of rows, column labels, non-null counts and dtypes.
star_wars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1187 entries, 0 to 1186
Data columns (total 38 columns):
RespondentID                                                                                                                                     1186 non-null float64
Have you seen any of the 6 films in the Star Wars franchise?                                                                                     1187 non-null object
Do you consider yourself to be a fan of the Star Wars film franchise?                                                                            837 non-null object
Which of the following Star Wars films have you seen? Please select all that apply.                                                              674 non-null object
Unnamed: 4                                                                                                                                       572 non-null object
Unnamed: 5                                                       

In [3]:
# List the raw column labels; multi-part questions show up as 'Unnamed: N' placeholders.
star_wars.columns

Index(['RespondentID',
       'Have you seen any of the 6 films in the Star Wars franchise?',
       'Do you consider yourself to be a fan of the Star Wars film franchise?',
       'Which of the following Star Wars films have you seen? Please select all that apply.',
       'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8',
       'Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.',
       'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Unnamed: 14',
       'Please state whether you view the following characters favorably, unfavorably, or are unfamiliar with him/her.',
       'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19',
       'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23',
       'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27',
       'Unnamed: 28', 'Which character shot first?',
       'Are you familiar with the Expan

In [4]:
# Preview the first rows. Row 0 is a sub-header ('Response', film titles, ...) and has no RespondentID.
star_wars.head()

Unnamed: 0,RespondentID,Have you seen any of the 6 films in the Star Wars franchise?,Do you consider yourself to be a fan of the Star Wars film franchise?,Which of the following Star Wars films have you seen? Please select all that apply.,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.,...,Unnamed: 28,Which character shot first?,Are you familiar with the Expanded Universe?,Do you consider yourself to be a fan of the Expanded Universe?æ,Do you consider yourself to be a fan of the Star Trek franchise?,Gender,Age,Household Income,Education,Location (Census Region)
0,,Response,Response,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,Star Wars: Episode I The Phantom Menace,...,Yoda,Response,Response,Response,Response,Response,Response,Response,Response,Response
1,3292880000.0,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,3,...,Very favorably,I don't understand this question,Yes,No,No,Male,18-29,,High school degree,South Atlantic
2,3292880000.0,No,,,,,,,,,...,,,,,Yes,Male,18-29,"$0 - $24,999",Bachelor degree,West South Central
3,3292765000.0,Yes,No,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,,,,1,...,Unfamiliar (N/A),I don't understand this question,No,,No,Male,18-29,"$0 - $24,999",High school degree,West North Central
4,3292763000.0,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,5,...,Very favorably,I don't understand this question,No,,Yes,Male,18-29,"$100,000 - $149,999",Some college or Associate degree,West North Central


Above we can see that the first column, "RespondentID", might be blank on some rows. 

Therefore, we must first remove the rows where "RespondentID" is missing.

In [5]:
# Keep only the rows that carry a RespondentID.
# .copy() turns the filtered selection into an independent DataFrame, so the column
# assignments performed in later cells (star_wars[col] = ...) operate on this frame
# directly instead of triggering pandas' SettingWithCopyWarning.
star_wars = star_wars[star_wars['RespondentID'].notnull()].copy()

In [6]:
# Re-check the frame after filtering: every remaining row should have a RespondentID.
star_wars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1186 entries, 1 to 1186
Data columns (total 38 columns):
RespondentID                                                                                                                                     1186 non-null float64
Have you seen any of the 6 films in the Star Wars franchise?                                                                                     1186 non-null object
Do you consider yourself to be a fan of the Star Wars film franchise?                                                                            836 non-null object
Which of the following Star Wars films have you seen? Please select all that apply.                                                              673 non-null object
Unnamed: 4                                                                                                                                       571 non-null object
Unnamed: 5                                                       

In [7]:
# Preview the first ten respondents now that rows without a RespondentID are gone.
star_wars.head(10)

Unnamed: 0,RespondentID,Have you seen any of the 6 films in the Star Wars franchise?,Do you consider yourself to be a fan of the Star Wars film franchise?,Which of the following Star Wars films have you seen? Please select all that apply.,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.,...,Unnamed: 28,Which character shot first?,Are you familiar with the Expanded Universe?,Do you consider yourself to be a fan of the Expanded Universe?æ,Do you consider yourself to be a fan of the Star Trek franchise?,Gender,Age,Household Income,Education,Location (Census Region)
1,3292880000.0,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,3.0,...,Very favorably,I don't understand this question,Yes,No,No,Male,18-29,,High school degree,South Atlantic
2,3292880000.0,No,,,,,,,,,...,,,,,Yes,Male,18-29,"$0 - $24,999",Bachelor degree,West South Central
3,3292765000.0,Yes,No,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,,,,1.0,...,Unfamiliar (N/A),I don't understand this question,No,,No,Male,18-29,"$0 - $24,999",High school degree,West North Central
4,3292763000.0,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,5.0,...,Very favorably,I don't understand this question,No,,Yes,Male,18-29,"$100,000 - $149,999",Some college or Associate degree,West North Central
5,3292731000.0,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,5.0,...,Somewhat favorably,Greedo,Yes,No,No,Male,18-29,"$100,000 - $149,999",Some college or Associate degree,West North Central
6,3292719000.0,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,1.0,...,Very favorably,Han,Yes,No,Yes,Male,18-29,"$25,000 - $49,999",Bachelor degree,Middle Atlantic
7,3292685000.0,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,6.0,...,Very favorably,Han,Yes,No,No,Male,18-29,,High school degree,East North Central
8,3292664000.0,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,4.0,...,Very favorably,Han,No,,Yes,Male,18-29,,High school degree,South Atlantic
9,3292654000.0,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,5.0,...,Somewhat favorably,Han,No,,No,Male,18-29,"$0 - $24,999",Some college or Associate degree,South Atlantic
10,3292640000.0,Yes,No,,Star Wars: Episode II Attack of the Clones,,,,,1.0,...,Very favorably,I don't understand this question,No,,No,Male,18-29,"$25,000 - $49,999",Some college or Associate degree,Pacific


Next step is to convert the values from two columns: 

"Have you seen any of the 6 films in the Star Wars franchise?" 

and 

"Do you consider yourself to be a fan of the Star Wars film franchise?". 

First we'll analyze all the answers and then convert them to Boolean values to make the data a bit easier to work with since we will be able to select the rows that are True or False without having to do a string comparison. 

In [8]:
# Tally the raw answers; dropna=False makes missing answers (NaN) appear as their own row.

star_wars['Have you seen any of the 6 films in the Star Wars franchise?'].value_counts(dropna=False)

Yes    936
No     250
Name: Have you seen any of the 6 films in the Star Wars franchise?, dtype: int64

In [9]:
# Tally the raw answers; dropna=False makes missing answers (NaN) appear as their own row.

star_wars['Do you consider yourself to be a fan of the Star Wars film franchise?'].value_counts(dropna=False)

Yes    552
NaN    350
No     284
Name: Do you consider yourself to be a fan of the Star Wars film franchise?, dtype: int64

Now we create a dictionary that will be used in the map() function to perform the conversion. The keys of the dictionary represent the values we found above.

In [10]:
# Lookup table for Series.map(): survey answer string -> boolean.
yes_no = {"Yes": True, "No": False}

In [11]:
# Convert both Yes/No questions to booleans with the yes_no lookup.
# Answers that are not a key of yes_no (i.e. missing answers) come out of map() as NaN.
for question in (
    'Have you seen any of the 6 films in the Star Wars franchise?',
    'Do you consider yourself to be a fan of the Star Wars film franchise?',
):
    star_wars[question] = star_wars[question].map(yes_no)

Now let's make sure both columns only contain the values True, False, and NaN

In [12]:
# Verify the conversion: only True/False (and NaN, if any, thanks to dropna=False) should be listed.

star_wars['Have you seen any of the 6 films in the Star Wars franchise?'].value_counts(dropna=False)

True     936
False    250
Name: Have you seen any of the 6 films in the Star Wars franchise?, dtype: int64

In [13]:
# Verify the conversion: only True/False (and NaN, if any, thanks to dropna=False) should be listed.

star_wars['Do you consider yourself to be a fan of the Star Wars film franchise?'].value_counts(dropna=False)

True     552
NaN      350
False    284
Name: Do you consider yourself to be a fan of the Star Wars film franchise?, dtype: int64

Next, we are going to change columns 3-8.

In [14]:
# Two rows are enough to see the "which films have you seen" columns: a film title or NaN.
star_wars.head(2)

Unnamed: 0,RespondentID,Have you seen any of the 6 films in the Star Wars franchise?,Do you consider yourself to be a fan of the Star Wars film franchise?,Which of the following Star Wars films have you seen? Please select all that apply.,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.,...,Unnamed: 28,Which character shot first?,Are you familiar with the Expanded Universe?,Do you consider yourself to be a fan of the Expanded Universe?æ,Do you consider yourself to be a fan of the Star Trek franchise?,Gender,Age,Household Income,Education,Location (Census Region)
1,3292880000.0,True,True,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,3.0,...,Very favorably,I don't understand this question,Yes,No,No,Male,18-29,,High school degree,South Atlantic
2,3292880000.0,False,,,,,,,,,...,,,,,Yes,Male,18-29,"$0 - $24,999",Bachelor degree,West South Central


As we can see, the fourth column records whether the respondent has seen "The Phantom Menace", and the five columns after it do the same for the following movies.

To make analysis easier, we should change the column title to "seen_X" (where X is the number of the movie) and all values should be booleans as well. 

We will first convert the values, as we did previously, and then change the column names.

In [15]:
# Snapshot of the "seen" columns before conversion (film title strings / NaN).
star_wars.head()

Unnamed: 0,RespondentID,Have you seen any of the 6 films in the Star Wars franchise?,Do you consider yourself to be a fan of the Star Wars film franchise?,Which of the following Star Wars films have you seen? Please select all that apply.,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.,...,Unnamed: 28,Which character shot first?,Are you familiar with the Expanded Universe?,Do you consider yourself to be a fan of the Expanded Universe?æ,Do you consider yourself to be a fan of the Star Trek franchise?,Gender,Age,Household Income,Education,Location (Census Region)
1,3292880000.0,True,True,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,3.0,...,Very favorably,I don't understand this question,Yes,No,No,Male,18-29,,High school degree,South Atlantic
2,3292880000.0,False,,,,,,,,,...,,,,,Yes,Male,18-29,"$0 - $24,999",Bachelor degree,West South Central
3,3292765000.0,True,False,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,,,,1.0,...,Unfamiliar (N/A),I don't understand this question,No,,No,Male,18-29,"$0 - $24,999",High school degree,West North Central
4,3292763000.0,True,True,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,5.0,...,Very favorably,I don't understand this question,No,,Yes,Male,18-29,"$100,000 - $149,999",Some college or Associate degree,West North Central
5,3292731000.0,True,True,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,5.0,...,Somewhat favorably,Greedo,Yes,No,No,Male,18-29,"$100,000 - $149,999",Some college or Associate degree,West North Central


In [16]:
cols = [
    'Which of the following Star Wars films have you seen? Please select all that apply.',
    'Unnamed: 4',
    'Unnamed: 5',
    'Unnamed: 6',
    'Unnamed: 7',
    'Unnamed: 8',
]

# Print the distinct answers found in each "seen" column so the exact strings can be
# copied into the replacement dictionary used in the next step.
# Watch the whitespace: "Star Wars: Episode I  The Phantom Menace" has two spaces between
# "I" and "The Phantom Menace", whereas the Episode V title has a single space.
for col in cols:
    answer_counts = star_wars[col].value_counts()
    print(answer_counts)

Star Wars: Episode I  The Phantom Menace    673
Name: Which of the following Star Wars films have you seen? Please select all that apply., dtype: int64
Star Wars: Episode II  Attack of the Clones    571
Name: Unnamed: 4, dtype: int64
Star Wars: Episode III  Revenge of the Sith    550
Name: Unnamed: 5, dtype: int64
Star Wars: Episode IV  A New Hope    607
Name: Unnamed: 6, dtype: int64
Star Wars: Episode V The Empire Strikes Back    758
Name: Unnamed: 7, dtype: int64
Star Wars: Episode VI Return of the Jedi    738
Name: Unnamed: 8, dtype: int64


In [17]:
# Each "seen" column holds either the film's title (the respondent has seen it) or NaN
# (they have not), so titles map to True and NaN maps to False.
# Keys must match the survey strings exactly, including the double space that follows
# the numeral in Episodes I-IV.
movie_dict = {
    'Star Wars: Episode I  The Phantom Menace': True,
    'Star Wars: Episode II  Attack of the Clones': True,
    'Star Wars: Episode III  Revenge of the Sith': True,
    'Star Wars: Episode IV  A New Hope': True,
    'Star Wars: Episode V The Empire Strikes Back': True,
    'Star Wars: Episode VI Return of the Jedi': True,
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0 and raises AttributeError.
    np.nan: False,
}

# `cols` is the list of the six "seen" column labels defined in the previous cell.
for col in cols:
    star_wars[col] = star_wars[col].map(movie_dict)

In [18]:
# Check the result: the six "seen" columns should now contain only True/False.
star_wars.head()

Unnamed: 0,RespondentID,Have you seen any of the 6 films in the Star Wars franchise?,Do you consider yourself to be a fan of the Star Wars film franchise?,Which of the following Star Wars films have you seen? Please select all that apply.,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.,...,Unnamed: 28,Which character shot first?,Are you familiar with the Expanded Universe?,Do you consider yourself to be a fan of the Expanded Universe?æ,Do you consider yourself to be a fan of the Star Trek franchise?,Gender,Age,Household Income,Education,Location (Census Region)
1,3292880000.0,True,True,True,True,True,True,True,True,3.0,...,Very favorably,I don't understand this question,Yes,No,No,Male,18-29,,High school degree,South Atlantic
2,3292880000.0,False,,False,False,False,False,False,False,,...,,,,,Yes,Male,18-29,"$0 - $24,999",Bachelor degree,West South Central
3,3292765000.0,True,False,True,True,True,False,False,False,1.0,...,Unfamiliar (N/A),I don't understand this question,No,,No,Male,18-29,"$0 - $24,999",High school degree,West North Central
4,3292763000.0,True,True,True,True,True,True,True,True,5.0,...,Very favorably,I don't understand this question,No,,Yes,Male,18-29,"$100,000 - $149,999",Some college or Associate degree,West North Central
5,3292731000.0,True,True,True,True,True,True,True,True,5.0,...,Somewhat favorably,Greedo,Yes,No,No,Male,18-29,"$100,000 - $149,999",Some college or Associate degree,West North Central


Next step is to rename the aforementioned columns to make them more intuitive.

As said above, we will change each column title to "seen_X" (where X is the number of the movie).

For example, the "Revenge of the Sith" column will change from "Unnamed: 5" to "seen_3", since it is the third movie in chronological order.

In [19]:
# Old label -> new label for the six "seen" columns; seen_N refers to Episode N.
old_seen_labels = [
    'Which of the following Star Wars films have you seen? Please select all that apply.',
    'Unnamed: 4',
    'Unnamed: 5',
    'Unnamed: 6',
    'Unnamed: 7',
    'Unnamed: 8',
]
new_seen_labels = ['seen_1', 'seen_2', 'seen_3', 'seen_4', 'seen_5', 'seen_6']
col_dict = dict(zip(old_seen_labels, new_seen_labels))

star_wars = star_wars.rename(columns=col_dict)

In [20]:
# Confirm the rename: the columns after the two Yes/No questions should read seen_1 ... seen_6.
star_wars.head()

Unnamed: 0,RespondentID,Have you seen any of the 6 films in the Star Wars franchise?,Do you consider yourself to be a fan of the Star Wars film franchise?,seen_1,seen_2,seen_3,seen_4,seen_5,seen_6,Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.,...,Unnamed: 28,Which character shot first?,Are you familiar with the Expanded Universe?,Do you consider yourself to be a fan of the Expanded Universe?æ,Do you consider yourself to be a fan of the Star Trek franchise?,Gender,Age,Household Income,Education,Location (Census Region)
1,3292880000.0,True,True,True,True,True,True,True,True,3.0,...,Very favorably,I don't understand this question,Yes,No,No,Male,18-29,,High school degree,South Atlantic
2,3292880000.0,False,,False,False,False,False,False,False,,...,,,,,Yes,Male,18-29,"$0 - $24,999",Bachelor degree,West South Central
3,3292765000.0,True,False,True,True,True,False,False,False,1.0,...,Unfamiliar (N/A),I don't understand this question,No,,No,Male,18-29,"$0 - $24,999",High school degree,West North Central
4,3292763000.0,True,True,True,True,True,True,True,True,5.0,...,Very favorably,I don't understand this question,No,,Yes,Male,18-29,"$100,000 - $149,999",Some college or Associate degree,West North Central
5,3292731000.0,True,True,True,True,True,True,True,True,5.0,...,Somewhat favorably,Greedo,Yes,No,No,Male,18-29,"$100,000 - $149,999",Some college or Associate degree,West North Central


In [21]:
# Confirm the dtypes: the seen_N columns should now be reported as bool.
star_wars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1186 entries, 1 to 1186
Data columns (total 38 columns):
RespondentID                                                                                                                                     1186 non-null float64
Have you seen any of the 6 films in the Star Wars franchise?                                                                                     1186 non-null bool
Do you consider yourself to be a fan of the Star Wars film franchise?                                                                            836 non-null object
seen_1                                                                                                                                           1186 non-null bool
seen_2                                                                                                                                           1186 non-null bool
seen_3                                                               

Next we are going to change the columns 9-14.

Basically, each column represents how the said movie ranks on a scale of 1-6 in terms of the respondent's preference.

For example, the value in column 9 represents the rank of "Phantom Menace" for each person.

Let's visualize 

In [22]:
# First six rows of the six ranking columns (positional columns 9 through 14).
star_wars.iloc[0:6,9:15]

Unnamed: 0,Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
1,3.0,2.0,1.0,4.0,5.0,6.0
2,,,,,,
3,1.0,2.0,3.0,4.0,5.0,6.0
4,5.0,6.0,1.0,2.0,4.0,3.0
5,5.0,4.0,6.0,2.0,1.0,3.0
6,1.0,4.0,3.0,6.0,5.0,2.0


As we can see above, the first respondent's favorite movie is "Revenge of the Sith" whereas respondent 5 prefers "The Empire Strikes Back".

Now let's change the answers from strings to floats and subsequently rename the columns.

In [23]:
# The six ranking columns sit at positions 9-14; cast them to float in one go.
ranking_labels = star_wars.columns[9:15]
star_wars[ranking_labels] = star_wars[ranking_labels].astype(float)

# Old label -> descriptive label, one entry per film in episode order.
rank_cols = {
    'Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.': 'ranking_phantom_menace',
    'Unnamed: 10': 'ranking_attack_clones',
    'Unnamed: 11': 'ranking_revenge_sith',
    'Unnamed: 12': 'ranking_new_hope',
    'Unnamed: 13': 'ranking_empire_strikes',
    'Unnamed: 14': 'ranking_return_jedi',
}

star_wars = star_wars.rename(columns=rank_cols)

In [24]:
# Same slice as before the rename: the values are unchanged, only the column labels differ.
star_wars.iloc[0:6,9:15]

Unnamed: 0,ranking_phantom_menace,ranking_attack_clones,ranking_revenge_sith,ranking_new_hope,ranking_empire_strikes,ranking_return_jedi
1,3.0,2.0,1.0,4.0,5.0,6.0
2,,,,,,
3,1.0,2.0,3.0,4.0,5.0,6.0
4,5.0,6.0,1.0,2.0,4.0,3.0
5,5.0,4.0,6.0,2.0,1.0,3.0
6,1.0,4.0,3.0,6.0,5.0,2.0


# 2. Data Analysis

Next step is to find the highest-ranked movie. To do so, we are going to extract the mean for each of the columns above.

Remember that a lower mean is better.

In [25]:
# Mean rank per film, best (lowest mean) first.
mean_ranking = star_wars.iloc[:, 9:15].mean()
mean_ranking.sort_values()

ranking_empire_strikes    2.513158
ranking_return_jedi       3.047847
ranking_new_hope          3.272727
ranking_phantom_menace    3.732934
ranking_attack_clones     4.087321
ranking_revenge_sith      4.341317
dtype: float64

As we can see, the highest-ranked Star Wars movie is "The Empire Strikes Back". 

Also, the survey indicates that the original trilogy movies are better ranked than the newer ones.

The next step is to plot a graph to better visualize those results. Although there are only a few values here, this is an important step, especially when working with a large number of values.

In [None]:
# Bar chart of the mean rank of each film, in episode order (lower bar = better rank).
film_labels = ('Phantom Menace', 'Attack of the Clones', 'Revenge of the Sith', 'New Hope', 'Empire Strikes Back', 'Return of the Jedi')
mean_ranks = star_wars[star_wars.columns[9:15]].mean()

rankings_graph = mean_ranks.plot(kind='bar')
rankings_graph.set_ylabel('Mean Rank')
rankings_graph.set_title('Highest-Ranked Star Wars Movies')
rankings_graph.set_xticklabels(film_labels);

Next we are going to visualize, both numerically and graphically, how many people watched each movie

In [None]:
# Number of respondents who have seen each film (each True counts as 1), most-watched first.
seen_counts = star_wars.iloc[:, 3:9].sum()
seen_counts.sort_values(ascending=False)

In [None]:
# Bar chart of how many respondents have seen each film, in episode order.
film_labels = ('Phantom Menace', 'Attack of the Clones', 'Revenge of the Sith', 'New Hope', 'Empire Strikes Back', 'Return of the Jedi')
viewer_totals = star_wars.iloc[:, 3:9].sum()

seen_rank = viewer_totals.plot(kind='bar')
seen_rank.set_title('Star Wars movies ranked by number of viewers')
seen_rank.set_xticklabels(film_labels);

As we can see, "The Empire Strikes Back" also ranks first this time. 

Moreover, "The Revenge of the Sith" once again represents the bottom value.

The only difference between this graph and the last one is that "A New Hope" and "The Phantom Menace" switched places.

Previously we discovered how the movies ranked compared to each other as well as how many respondents actually watched them.

The next step is to perform the same analysis but with different subgroups such as gender and whether or not the person is a fan of the franchise

In [None]:
# Split the respondents by their answer to the Gender question.
# Rows whose Gender is neither 'Male' nor 'Female' (e.g. missing) end up in neither subset.
gender = star_wars['Gender']
males = star_wars[gender == 'Male']
females = star_wars[gender == 'Female']

In [None]:
# Side-by-side bar charts of the mean film ranking: males on the left, females on the right.
fig = plt.figure(figsize=(30,10))
fig.subplots_adjust(wspace=0.5)

film_labels = ('Phantom Menace', 'Attack of the Clones', 'Revenge of the Sith', 'New Hope', 'Empire Strikes Back', 'Return of the Jedi')

# Left panel: males.
ax1 = fig.add_subplot(1,2,1)
males[males.columns[9:15]].mean().plot(kind='bar', ax=ax1)
ax1.set_title('Highest-Ranked Star Wars Movies (Males)', size=20)
ax1.set_ylabel('Mean Rank (Males)', size=20)
ax1.set_xticklabels(film_labels, size=20)
ax1.tick_params(labelsize=20)

# Right panel: females.
ax2 = fig.add_subplot(1,2,2)
females[females.columns[9:15]].mean().plot(kind='bar', ax=ax2)
ax2.set_title('Highest-Ranked Star Wars Movies (Females)', size=20)
ax2.set_ylabel('Mean Rank (Females)', size=20)
ax2.set_xticklabels(film_labels, size=20)
ax2.tick_params(labelsize=20)

In [None]:
# Side-by-side bar charts of how many respondents have seen each film:
# males on the left, females on the right.
fig = plt.figure(figsize=(30,10))
fig.subplots_adjust(wspace=0.5)

film_labels = ('Phantom Menace', 'Attack of the Clones', 'Revenge of the Sith', 'New Hope', 'Empire Strikes Back', 'Return of the Jedi')

# Left panel: males. Passing ax= explicitly ties the plot to this subplot instead of
# relying on whichever axes pyplot currently considers "active".
ax1 = fig.add_subplot(1,2,1)
males.iloc[:,3:9].sum().plot(kind='bar', ax=ax1)
ax1.set_title('Star Wars movies ranked by number of viewers (Males)', size=20)
ax1.set_ylabel('Number of viewers (Males)', size=20)
ax1.set_xticklabels(film_labels, size=20)
ax1.tick_params(labelsize=20)

# Right panel: females. The y-axis shows a viewer count, not a mean rank, so it is
# labelled accordingly, and every call targets ax2 (the female panel) by name.
ax2 = fig.add_subplot(1,2,2)
females.iloc[:,3:9].sum().plot(kind='bar', ax=ax2)
ax2.set_title('Star Wars movies ranked by number of viewers (Females)', size=20)
ax2.set_ylabel('Number of viewers (Females)', size=20)
ax2.set_xticklabels(film_labels, size=20)
ax2.tick_params(labelsize=20)