### This Program is designed for hands on experience difference between Plain Python and Pandas

#### OPTION 1: MOVIES DATASET ANALYSIS USING PLAIN PYTHON

In [15]:
import csv

def calculate_rating_stats(data, industry=None):
    ratings = []

    for row in data:
        if row[3] != 'None' and (not industry or row[1] == industry):
            ratings.append(float(row[3]))

    # IMPORTANT: return AFTER loop, not inside it        
    max_ratings = max(ratings)
    min_ratings = min(ratings)
    avg_ratings = sum(ratings) / len(ratings)

    return max_ratings, min_ratings, avg_ratings


with open("movies.csv") as file:
    data = list(csv.reader(file))

    header = data[:1]
    details = data[1:]   # skip header

max_ratings, min_ratings, avg_ratings = calculate_rating_stats(details)
print(f"----- All Records ----- \nMin Rating = {min_ratings}, Max Rating = {max_ratings}, Average Ratings = {avg_ratings}")


# Calculating how much time it takes for Plain Python to give us stats on movies dataset:
print("\n— — Plain Python Time Duration for stats — -")
%timeit -n 10 -r 5 calculate_rating_stats(details, industry=None)   # 1 ms = 1000 μs    #convert µs → ms by dividing by 1000



----- All Records ----- 
Min Rating = 35.0, Max Rating = 89.0, Average Ratings = 63.72727272727273

— — Plain Python Time Duration for stats — -
8.01 μs ± 251 ns per loop (mean ± std. dev. of 5 runs, 10 loops each)


#### OPTION 2: MOVIE DATASET ANALYSIS USING PANDAS LIBRARY

In [16]:
import pandas as pd

#Getting movie.csv file
df = pd.read_csv("movies.csv")
df  #'df' stands for  DataFrame.  It is tabular data structure, just like any csv file. Think of a DataFrame as an Excel sheet.


Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,RTRating,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,64,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.090000,68,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,43,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.000000,15,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,28,$219.37,2008
...,...,...,...,...,...,...,...,...
72,Across the Universe,romance,Independent,84,0.652603,54,$29.37,2007
73,A Serious Man,Drama,Universal,64,4.382857,89,$30.68,2009
74,A Dangerous Method,Drama,Independent,89,0.448645,79,$8.97,2011
75,27 Dresses,Comedy,Fox,71,5.343622,40,$160.31,2008


In [17]:
#Type of DF
type(df)    #returns pandas.dataframe type
type(df.Genre)  #returns pandas.Series because df.anything become property called series.

pandas.core.series.Series

In [None]:
df.head()  #OR df.head(10)     #will print only few rows

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,RTRating,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,64,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,68,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,43,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.0,15,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,28,$219.37,2008


In [19]:
df.tail(10)     #will print last 10 rows. Starting from tail.

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,RTRating,Worldwide Gross,Year
67,Four Christmases,Comedy,Warner Bros.,52,2.022925,26,$161.83,2008
68,Fireproof,Drama,Independent,51,66.934,40,$33.47,2008
69,Enchanted,Comedy,Disney,80,4.005737,93,$340.49,2007
70,Dear John,Drama,Sony,66,4.5988,29,$114.97,2010
71,Beginners,Comedy,Independent,80,4.471875,84,$14.31,2011
72,Across the Universe,romance,Independent,84,0.652603,54,$29.37,2007
73,A Serious Man,Drama,Universal,64,4.382857,89,$30.68,2009
74,A Dangerous Method,Drama,Independent,89,0.448645,79,$8.97,2011
75,27 Dresses,Comedy,Fox,71,5.343622,40,$160.31,2008
76,(500) Days of Summer,comedy,Fox,81,8.096,87,$60.72,2009


In [20]:
#Random printing
df.sample(5)

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,RTRating,Worldwide Gross,Year
60,High School Musical 3: Senior Year,Comedy,Disney,76,22.913136,65,$252.04,2008
75,27 Dresses,Comedy,Fox,71,5.343622,40,$160.31,2008
50,Life as We Know It,Comedy,Independent,62,2.530526,28,$96.16,2010
53,Leap Year,Comedy,Universal,49,1.715263,21,$32.59,2010
18,The Heartbreak Kid,Comedy,Paramount,41,2.129444,30,$127.77,2007


In [21]:
#Indexing
df[18:29]   #similar to python, NumPy indexing

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,RTRating,Worldwide Gross,Year
18,The Heartbreak Kid,Comedy,Paramount,41,2.129444,30,$127.77,2007
19,The Duchess,Drama,Paramount,68,3.20785,60,$43.31,2008
20,The Curious Case of Benjamin Button,Fantasy,Warner Bros.,81,1.783944,73,$285.43,2008
21,The Back-up Plan,Comedy,CBS,47,2.202571,20,$77.09,2010
22,Tangled,Animation,Disney,88,1.365692,89,$355.01,2010
23,Something Borrowed,Romance,Independent,48,1.719514,15,$60.18,2011
24,She's Out of My League,Comedy,Paramount,60,2.4405,57,$48.81,2010
25,Sex and the City Two,Comedy,Warner Bros.,49,2.8835,15,$288.35,2010
26,Sex and the City 2,Comedy,Warner Bros.,49,2.8835,15,$288.35,2010
27,Sex and the City,Comedy,Warner Bros.,81,7.221796,49,$415.25,2008


In [22]:
#Shape. Similar to NumPy `np.shape`
df.shape        #77 rows, 8 columns

(77, 8)

In [24]:
df["RTRating"]   #Return 'Rotten Tomatoes %' Rating Section

0     64
1     68
2     43
3     15
4     28
      ..
72    54
73    89
74    79
75    40
76    87
Name: RTRating, Length: 77, dtype: int64

In [None]:
dir(df.Film)    #In Python You can Print entire function list by just `dir(df.anything)`

['T',
 '_AXIS_LEN',
 '_AXIS_ORDERS',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__bool__',
 '__class__',
 '__column_consortium_standard__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__firstlineno__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pandas_priority__',
 '__pos__',
 '__pow__',
 '_

In [None]:
#Getting Min, Max and Avg Ratings of Movies:

df.RTRating.min(), df.RTRating.max(), df.RTRating.min()

(np.int64(3), np.int64(96), np.int64(3))

In [None]:
#Conditionl Filtering Selecting From Data:
df_year = df[df.Year==2008]   #will return only for year 2008


In [None]:
#Calculating Rating of Year2008 Movies

df_year.RTRating.min(), df_year.RTRating.max(), df_year.RTRating.mean()


(np.int64(13), np.int64(96), np.float64(53.75))