In [40]:
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SQLContext
import os

# User



In [3]:
userData = pd.read_csv('Data/u.user', sep = "|", encoding = "iso-8859-1", names =['UserId','Age','Gender','Occupation','Zip'])
userData.head(5)

Unnamed: 0,UserId,Age,Gender,Occupation,Zip
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


# Movie


In [4]:
genre = pd.read_csv('Data/u.genre', sep = "|")
genre.head(20)

genre_list = list(pd.Series(genre['unknown']))

In [11]:
movieInfo = pd.read_csv('Data/u.item', sep = "|", encoding = "iso-8859-1", names=['MovieId','Title','Date','RealeseDate', 'VideoRDate','IMDB']+genre_list)
requiredInfo = movieInfo.drop(['Date', 'RealeseDate','VideoRDate','IMDB'], axis =1)
movieInfo.head(5)

Unnamed: 0,MovieId,Title,Date,RealeseDate,VideoRDate,IMDB,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
indexedMovies = requiredInfo.drop(genre_list, axis=1)
indexedMovies.head(5)

In [21]:
columnPurpose = requiredInfo.drop(['MovieId','Title'],axis = 1)

In [43]:
requiredInfo.to_csv("ProcessedData/requiredInfo.csv", index=False)
columnPurpose.to_csv("ProcessedData/movie_genre.csv", index=False)
indexedMovies.to_csv("ProcessedData/indexedMovies.csv", index=False)

# User-Ratings


In [24]:
def mapper1(row):
    user, movie, rating, date = row.split("\t")
    return((user,["%s-%s" %(movie,rating)]))

In [59]:
def sortMoviesByRatings(row):
    movies = row[1]
    movies.sort(key = lambda x: -int(x[-1]))
    return (row[0],"\t".join(movies))

In [60]:
sc = SparkContext.getOrCreate()

ratings = sc.textFile('Data/u.data').map(mapper1)
ratings.collect()
ratings = ratings.reduceByKey(lambda x,y: x+y)



In [61]:
ratings = ratings.map(sortMoviesByRatings)
ratings.collect()

[('22',
  '128-5\t258-5\t510-5\t173-5\t186-5\t96-5\t403-5\t435-5\t176-5\t550-5\t238-5\t455-5\t358-5\t208-5\t153-5\t194-5\t168-5\t204-5\t187-5\t515-5\t85-5\t202-5\t184-5\t24-5\t523-5\t181-5\t144-5\t290-5\t89-5\t4-5\t127-5\t250-5\t50-5\t174-5\t80-4\t79-4\t511-4\t227-4\t399-4\t117-4\t999-4\t502-4\t222-4\t393-4\t648-4\t118-4\t692-4\t226-4\t172-4\t230-4\t154-4\t195-4\t651-4\t451-4\t840-4\t431-4\t216-4\t175-4\t109-4\t21-4\t201-4\t568-4\t17-4\t161-4\t712-4\t430-4\t228-4\t732-4\t62-4\t209-4\t526-4\t385-4\t792-4\t68-4\t376-3\t241-3\t433-3\t121-3\t386-3\t407-3\t871-3\t53-3\t167-3\t211-3\t94-3\t636-3\t684-3\t546-3\t1000-3\t566-3\t384-3\t210-3\t731-3\t265-3\t233-3\t231-2\t229-2\t2-2\t377-1\t791-1\t687-1\t456-1\t294-1\t780-1\t29-1\t683-1\t948-1\t110-1\t554-1\t878-1\t997-1\t1001-1\t411-1\t665-1\t996-1\t932-1\t862-1\t105-1\t367-1\t449-1\t988-1\t1003-1\t163-1\t1002-1\t688-1\t926-1\t405-1\t998-1'),
 ('244',
  '154-5\t89-5\t652-5\t238-5\t743-5\t237-5\t509-5\t101-5\t179-5\t208-5\t1098-5\t764-5\t191-5\t10

In [73]:
sqlContext = SQLContext(sc)
combinedratings = sqlContext.createDataFrame(ratings)
combinedratings.createOrReplaceTempView("ratings")

combinedratings.coalesce(1).write.csv('ProcessedData/combinedRatings/')
os.system('mv ./ProcessedData/combinedRatings/*.csv ./ProcessedData/combinedRatings/combinedRatings.csv')


0