# Big Data HW1
## April 2022 
- Efrat Levy 301035184
- Edan Shahmoon 
- Ilan Vasilevsky 322545682

In [1]:
import redis
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
import time
from datetime import datetime,timedelta
import pandas as pd

# Setup
Functions for connecting to the databases and setting them up.

In [2]:
# Which deployment to talk to: 'docker' for development on docker,
# 'production' for the real server.
current_env = 'production'
# Student user name; it names the Mongo database and prefixes the Redis keys below.
current_user = 'stud10'
# Host address for every supported environment.
ENV = {
    "docker": '172.17.0.1',
    "production": 'bdl1.eng.tau.ac.il',
}
# Redis set that holds all known company names.
companiesSet = "{}:company:names".format(current_user)
# Prefix of the per location/title open-jobs counters in Redis.
open_jobs = "{}:oj:".format(current_user)
# Key name reserved for candidate applications.
cand_app_set = "{}:candidate_application".format(current_user)
# Redis logical database number used by this notebook.
redis_db_num = 10

### Connections

In [3]:
def connect_to_redis(db_num=redis_db_num,port=6379):
    """Open a Redis connection for the current environment and verify it.

    db_num -- Redis logical database to select; a dedicated number is used
              so that other students' data is not disturbed.
    port   -- TCP port of the Redis server.

    Returns the connected client. The ping propagates whatever error the
    redis client raises when the server cannot be reached.
    """
    host = ENV[current_env]
    connection = redis.StrictRedis(
        host=host,
        port=port,
        db=db_num,
        socket_connect_timeout=10,
    )
    # ping makes sure a connection really was established
    connection.ping()
    print('connected to redis on "{}", port {}'.format(host, port)) 
    return connection

def connect_to_mongo(port=27017):
    """Create a MongoClient for the current environment.

    port -- TCP port of the Mongo server; only used in the 'docker'
            environment. In 'production' the client is created with the
            pymongo defaults and this argument is ignored.

    Returns the MongoClient.
    Raises ValueError when `current_env` is not one of the supported
    environments (previously this surfaced as an UnboundLocalError).
    """
    if current_env == 'docker':
        client = MongoClient(host=ENV[current_env],port=port, connectTimeoutMS=10000)
        # client.admin.command('ping') # try to ping mongo server
        print('connected to MongoDB on "{}", port {}'.format(ENV[current_env], port))
        return client

    if current_env == 'production':
        return MongoClient()

    raise ValueError(
        'unknown environment "{}", expected one of: {}'.format(
            current_env, ', '.join(sorted(ENV))))


### Initialization

In [4]:
def setup_mongo(client):
    """Return the user's Mongo database and its `companies` collection.

    client -- a connected MongoClient.

    Returns a (database, companies collection) tuple; the database is the
    one named after `current_user`.
    """
    database = client[current_user]
    companies_collection = database.companies
    print("companies collection created in {} db".format(current_user))
    return database, companies_collection

def reset_redis(r):
    """Remove every key from the Redis database `r` is connected to."""
    r.flushdb()
    message = 'redis database number {} is clean'
    print(message.format(redis_db_num))

def reset_mongo(client):
    """Drop the `companies` collection of the current user's Mongo database."""
    user_db = client[current_user]
    user_db.companies.drop()
    print("mongo is clean")

def restart():
    """Connect to both databases, wipe them and set Mongo up again.

    Returns a (redis client, mongo client, mongo database, companies
    collection) tuple.
    """
    # step 1: open the connections
    redis_conn = connect_to_redis()
    mongo_client = connect_to_mongo()

    # step 2: remove all previous data
    reset_redis(redis_conn)
    reset_mongo(mongo_client)

    # step 3: prepare the database handles
    database, companies_collection = setup_mongo(mongo_client)
    print("restart completed")
    return redis_conn, mongo_client, database, companies_collection

# Operation 1 - Add a new company
We keep the set of company names in Redis so that the uniqueness check of this operation is fast. The function does the following: <br>
- Checks if the company name exists in Redis
- If not, checks if the company exists in Mongo
- If it is found in neither, the company is new, so we save it to Mongo and add its name to the Redis set.

In [5]:
def is_company_exists(r, companies, company_name):
    """Tell whether a company with this name is already known.

    r            -- Redis client holding the company-names set.
    companies    -- Mongo companies collection.
    company_name -- name to look for.

    Returns True when the name is a member of the Redis set or, failing
    that, when Mongo has a document with this company_name; False otherwise.
    """
    # fast path: the company-names set on redis
    if r.sismember(companiesSet, company_name):
        return True

    # slow path: verify against mongo
    stored = companies.find_one({"company_name": company_name})
    return stored is not None
    
def add_company(r, companies,company_dict):
    """Insert a new company into Mongo and register its name in Redis.

    r            -- Redis client.
    companies    -- Mongo companies collection.
    company_dict -- company document; must contain 'company_name'. Its
                    'jobs_list' entry is (re)set to an empty list before
                    the insert, so the caller's dict is modified.

    Always returns None. Problems are reported with a printed message
    instead of an exception, and in that case nothing is written.
    """
    # the name is mandatory - bail out early without touching the db
    if 'company_name' not in company_dict:
        # raise ValueError("company dict must have company_name field") #no errors allowed
        print("company dict must have company_name field, no changes were commited to db")
        return None

    name = company_dict['company_name']

    # company names must be unique
    if is_company_exists(r, companies, name):
        # raise ValueError("company name already taken") #no errors allowed
        print("company name already taken, no changes were commited to db")
        return None

    # every company starts with an empty jobs list
    company_dict['jobs_list'] = []
    # store the document in mongo, then remember the name in redis
    companies.insert_one(company_dict)
    r.sadd(companiesSet, company_dict['company_name'])
    print("%s Added successfully!"%(name))

# Operation 2 - Add a new job position
For time efficiency of operation 6, we keep the number of open jobs in Redis as simple `key:value` counters. The key is `<user>:oj:<location>:<job title>` and the value is the number of open jobs with that location and title. <br> This function does the following: <br>
- checks if company exists
- Mongo update: adds job to mongo database
- Redis Update: if the job status is open, increase the value for key location:title by 1. If the key doesn't exist, it creates it and sets the value to 1.

In [6]:
def generate_job_id(companies, company_name):
    """Return the id for the next job of `company_name`.

    The id is the current length of the company's jobs_list plus one.
    Raises IndexError when the aggregation finds no such company.
    """
    pipeline = [
        {'$match': {'company_name': company_name}},
        {'$project': {'max_id': {'$size': '$jobs_list'}}},
    ]
    rows = list(companies.aggregate(pipeline))
    first_row = rows[0]
    return first_row['max_id'] + 1

def add_job(r, companies, job_dict, company_name):
    """Append a job to a company and count it in Redis when it is open.

    r            -- Redis client.
    companies    -- Mongo companies collection.
    job_dict     -- job document; 'status' is always read, and
                    'job_title' and 'location' are read when the status is
                    'open'. 'job_id' and an empty 'application_list' are
                    written into it, so the caller's dict is modified.
    company_name -- company that publishes the job.

    Returns None. An unknown company is reported with a printed message.
    """
    # ASSUMPTION: there is no option to delete jobs (so counting jobs can be used to generate job id)
    company_known = is_company_exists(r, companies, company_name)
    if not company_known:
        print("company doesn't exist")
        return

    # complete the job document and push it into the company's jobs list
    job_dict['job_id'] = generate_job_id(companies, company_name)
    job_dict['application_list'] = []
    selector = {'company_name': company_name}
    push_job = {'$push': {'jobs_list': job_dict}}
    companies.update_one(selector, push_job, upsert = True)

    # only open jobs are counted on redis
    if job_dict['status'] == 'open':
        title = job_dict['job_title']
        location = job_dict['location']
        counter_key = open_jobs + "%s:%s"%(location,title)
        open_count = r.incr(counter_key)
        print('There are now {} open jobs for: {} in {}'.format(open_count,title,location))

    print("job with id %s was added to %s jobs successfully!"%(job_dict['job_id'], company_name))

# Operation 3 - Add a new application
For time efficiency of operation 5, we keep one Redis sorted set per candidate (keyed by the candidate's email). Each member of the set is a company name, and its score is the unix timestamp of the candidate's most recent application to that company (used for time comparisons). <br>
This function does the following:
- Check if job is open
- If open, check if the candidate didn't already submit an application
- If it is a new candidate, create a new application in mongo, and update the corresponding key with a new date if it is more recent than the existing one.

In [7]:
def is_job_open(companies, company_name, job_id):
    """Tell whether job `job_id` of `company_name` exists and is open.

    companies    -- Mongo companies collection.
    company_name -- company that owns the job.
    job_id       -- job id, as an int or a numeric string.

    Returns True only when the company exists and has a job with this id
    whose status is 'open'; an unknown company yields False instead of
    raising. Raises ValueError when job_id cannot be converted to int.
    """
    res = companies.find_one(
        {
            "company_name": company_name,
        },
        # the projection keeps 'jobs_list' only when one element matches
        { "jobs_list": { "$elemMatch": { "job_id": int(job_id), "status": "open" }}}
    )
    # find_one returns None when the company itself does not exist
    return res is not None and 'jobs_list' in res

def is_already_submitted(companies, company_name, job_id, email):
    """Tell whether `email` already applied to job `job_id` of `company_name`.

    companies    -- Mongo companies collection.
    company_name -- company that owns the job.
    job_id       -- job id, as an int or a numeric string.
    email        -- candidate email to look for in the job's application list.

    Returns True when an application with this email exists for the job;
    an unknown company yields False instead of raising. Raises ValueError
    when job_id cannot be converted to int.
    """
    res = companies.find_one(
        {
            "company_name": company_name,
        },
        { "jobs_list": { "$elemMatch": { "job_id": int(job_id), "application_list": {"$elemMatch": {'email':email}} }}}
    )
    # if the mail doesn't exist the projected document has no 'jobs_list';
    # find_one returns None when the company itself does not exist
    return res is not None and 'jobs_list' in res

def new_application(r, companies, candidate, application_time, job_id, company_name):
    """Register a candidate's application to a job in Mongo and Redis.

    r                -- Redis client.
    companies        -- Mongo companies collection.
    candidate        -- candidate document; 'email' and 'candidate_name'
                        are read and 'application_date' is written into it,
                        so the caller's dict is modified.
    application_time -- application time as a "%d-%m-%Y %H:%M:%S" string.
    job_id           -- job id, as an int or a numeric string.
    company_name     -- company that owns the job.

    Returns 1 on success, -1 when the job is not open (or does not exist)
    and -2 when this email already applied to the job. Raises ValueError
    when application_time does not match the expected format.
    """
    if not is_job_open(companies, company_name, job_id):
        print("you are trying to apply to a closed job")
        return -1

    if is_already_submitted(companies, company_name, job_id, candidate['email']):
        print("you have already sent application for this job")
        return -2

    # update in mongo
    d = datetime.strptime(application_time, "%d-%m-%Y %H:%M:%S")
    candidate['application_date'] = d
    companies.update_one({"company_name": company_name,"jobs_list": {"$elemMatch":{"job_id":int(job_id)}}},{'$push':{'jobs_list.$.application_list':candidate}})

    # update in redis: the candidate's sorted set maps company -> timestamp
    # of the most recent application
    unix_d = time.mktime(d.timetuple())
    key_string = "candidate_applications:{}".format(candidate['email'])
    # a single ZSCORE lookup replaces reading the whole set and filtering
    # it in Python; it returns None when the company is not in the set yet
    old_timestamp = r.zscore(key_string, company_name)
    # only write when the company is new or this application is more recent
    if old_timestamp is None or unix_d > old_timestamp:
        # NOTE(review): positional (score, member) is the redis-py 2.x
        # StrictRedis form used throughout this file - confirm the installed version
        r.zadd(key_string,unix_d,company_name)

    print("{} submited application for job number {} at {}".format(candidate['candidate_name'],job_id, company_name))
    return 1

# Operation 4 - Update job status
This function does the following:
- Check if job exists
- Redis update: decrement the counter for key location:job_title when an open job is closed, or increment it when a closed job is reopened
- Mongo update: change status for relevant job

In [8]:
def update_job_status(r, companies,company_name, job_id, new_status):
    """Change the status of a job and keep the Redis open-jobs counter in sync.

    r            -- Redis client.
    companies    -- Mongo companies collection.
    company_name -- company that owns the job.
    job_id       -- job id, as an int or a numeric string.
    new_status   -- new status; 'open' and 'close' are the values that
                    move the counter.

    Returns the number of open jobs for the job's location and title after
    the update, or -1 when the company has no job with this id. Raises
    ValueError when job_id cannot be converted to int.
    """
    job_num = int(job_id)
    res = list(companies.aggregate([
        # filter on the requested company first, so only its jobs are
        # unwound (the company used to be hard-coded)
        { "$match": {"company_name": company_name}},
        { "$unwind": "$jobs_list"},
        { "$match": {"jobs_list.job_id": job_num}},
        { "$project": { "jobs_list.status" : 1, 'jobs_list.location': 1, 'jobs_list.job_title': 1}}
    ]))

    if not res:
        print('job id {} does not exist'.format(job_id))
        return -1

    job = res[0]['jobs_list']
    old_status = job['status']
    location = job['location']
    job_title = job['job_title']

    #update redis
    key_string = open_jobs + "%s:%s"%(location,job_title)
    if old_status == 'open' and new_status == 'close':
        oj_num = r.decr(key_string)

    elif old_status == 'close' and new_status == 'open':
        oj_num = r.incr(key_string)

    else:
        # the counter does not move; a missing counter key means zero open jobs
        oj_num = int(r.get(key_string) or 0)

    #update mongo
    companies.update_one(
        {"company_name": company_name,
        "jobs_list": {
            "$elemMatch":{"job_id":job_num}}},
        {'$set':{'jobs_list.$.status':new_status}}
        )

    print("job number: {} at {} is now: {}".format(job_id, company_name, new_status))
    print('There are now {} open jobs for: {} in {}'.format(oj_num, job_title,location))
    return int(oj_num)

# Operation 5 - show latest companies
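Uses the candidate's Redis sorted set, so the ten most recent companies are read with a single `ZREVRANGE` call (O(log N + 10) for a set of N companies) without querying Mongo.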

In [9]:
def show_latest_10_companies(r, candidate_email):
    """Return up to ten companies the candidate applied to, most recent first.

    r               -- Redis client.
    candidate_email -- email that identifies the candidate's sorted set.

    Returns the list of members read from Redis; when it is empty a
    message is printed and the empty result is returned.
    """
    candidate_key = "candidate_applications:{}".format(candidate_email)
    # highest scores (latest timestamps) first, ranks 0..9
    latest = r.zrevrange(candidate_key, 0, 9, withscores=False)
    if not len(latest):
        print('No candidate with this email: {}'.format(candidate_email))
    return latest

# Operation 6 - show number of open jobs
Use Redis for O(1) Complexity. 

In [10]:
def show_number_of_jobs(r,location,title):
    """Return the number of open jobs with this location and title.

    The value is read from the Redis counter for the location/title pair;
    a missing (or empty) counter counts as 0.
    """
    counter_key = open_jobs + "%s:%s"%(location,title)
    stored = r.get(counter_key)
    if not stored:
        return 0
    return int(stored)

# Report 1
This function returns a pandas DataFrame with the number of open and closed jobs for each company. It uses the Mongo database for this purpose. <br>
We tried sorting values in mongo instead of in pandas, but it didn't work properly. We believe it is due to old version of mongo. We left commented code in solution so you can see we tried.

In [11]:
def count_jobs_by_company(companies):
    """Report the number of open and closed jobs of every company.

    companies -- Mongo companies collection.

    Returns a DataFrame with the columns company_name, open_jobs and
    closed_jobs, sorted by open_jobs and then closed_jobs in descending
    order. Returns None (after printing a message) when the aggregation
    result lacks these columns, e.g. when it is empty.
    """
    def jobs_with_status(status, alias):
        # number of jobs_list elements whose status equals `status`
        return {
            '$size': {
                '$filter': {
                    'input': '$jobs_list',
                    'as': alias,
                    'cond': {'$eq': ['$${}.status'.format(alias), status]},
                }
            }
        }

    projection = {
        '$project': {
            'company_name': 1,
            '_id': 0,
            'open_jobs': jobs_with_status('open', 'oj'),
            'closed_jobs': jobs_with_status('close', 'cj'),
        }
    }
    # earlier attempt to sort inside mongo, kept for reference:
    # sort1 = {'$sort':{'open_jobs':-1,'closed_jobs':-1}} #this doesn't work as expected
    # res = companies.aggregate([project,sort])
    records = list(companies.aggregate([projection]))
    frame = pd.DataFrame(records)

    try:
        # fixed column order, for aesthetic reasons
        frame = frame[['company_name','open_jobs','closed_jobs']]
        # the sorting is done in pandas
        frame = frame.sort_values(by=['open_jobs','closed_jobs'], ascending=False)
    except KeyError:
        print('No job at any company exists - no records returned')
        return None

    return frame

# Report 2
This function returns a pandas DataFrame with the number of candidates who applied in the last 20 days to each job of the company `TAU`. It uses the Mongo database for this purpose.

In [12]:
def count_candidates_by_job(companies, company_name='TAU', days=20):
    """Count the recent applications of every job of one company.

    companies    -- Mongo companies collection.
    company_name -- company whose jobs are reported; defaults to 'TAU',
                    the value that used to be hard-coded.
    days         -- size of the look-back window in days; only applications
                    whose application_date is within the last `days` days
                    are counted. Defaults to the former fixed 20.

    Returns a DataFrame with the columns job_id and applications, one row
    per job that received at least one application in the window. The
    columns are present even when there are no rows.
    """
    from_date = datetime.now() - timedelta(days=days)
    pipeline = [
        {'$match': {'company_name': company_name}},
        {'$unwind': '$jobs_list'},
        {'$unwind': '$jobs_list.application_list'},
        {'$match': {
            'jobs_list.application_list.application_date': {
                '$gte': from_date
            }
        }},
        {'$group': {'_id': '$jobs_list.job_id', 'applications': {'$sum': 1}}},
    ]
    rows = list(companies.aggregate(pipeline))
    # naming the columns fixes their order and keeps them on an empty result
    df = pd.DataFrame(rows, columns=['_id', 'applications'])
    return df.rename(columns={'_id': 'job_id'})

# Recovery

In [13]:
def recovery(r, companies):
    """Rebuild the Redis data from the documents stored in Mongo.

    r         -- Redis client to write to; when None a new connection is
                 opened with connect_to_redis().
    companies -- Mongo companies collection, used as the source of truth.

    Rebuilds three things: the company-names set, the open-jobs counter of
    every location/title pair, and each candidate's sorted set of companies
    scored by the most recent application date. The values are computed in
    memory first and then written with absolute values, so running the
    recovery on a Redis that still holds data does not double count.

    Returns a dict with the number of companies, jobs and applications
    that were read. The 'comapnies' key keeps its original spelling so
    existing callers continue to work.
    """
    if r is None:
        r = connect_to_redis()

    company_names = []
    open_counts = {}        # counter key -> number of open jobs
    latest_by_candidate = {}  # candidate key -> {company name: newest timestamp}
    c = 0
    j = 0
    a = 0
    for comp in companies.find():
        c += 1
        company_name = comp['company_name']
        company_names.append(company_name)
        for job in comp['jobs_list']:
            j += 1
            counter_key = open_jobs + "%s:%s"%(job['location'], job['job_title'])
            # closed jobs register the key with 0 so a stale counter is reset
            is_open = 1 if job['status'] == 'open' else 0
            open_counts[counter_key] = open_counts.get(counter_key, 0) + is_open

            for app in job['application_list']:
                a += 1
                unix_d = time.mktime(app['application_date'].timetuple())
                candidate_key = "candidate_applications:{}".format(app['email'])
                per_company = latest_by_candidate.setdefault(candidate_key, {})
                # keep only the most recent application per company
                if company_name not in per_company or unix_d > per_company[company_name]:
                    per_company[company_name] = unix_d

    if company_names:
        r.sadd(companiesSet, *company_names)

    for counter_key, count in open_counts.items():
        r.set(counter_key, count)

    for candidate_key, per_company in latest_by_candidate.items():
        for company_name, unix_d in per_company.items():
            # NOTE(review): positional (score, member) is the redis-py 2.x
            # StrictRedis form used throughout this file - confirm the installed version
            r.zadd(candidate_key, unix_d, company_name)

    return {'comapnies':c,'jobs':j,'applications':a}

# Run all operations

In [16]:
def execute():
    """Run every operation, both reports and the recovery once, printing the results.

    Both databases are wiped first, so the run always starts from an empty
    state. Returns None.
    """
    separator = '--------------------------------'

    def section(title, blank_line_before=True):
        # print('') emits one empty line on both Python 2 and Python 3,
        # whereas a bare `print` statement does nothing on Python 3
        if blank_line_before:
            print('')
        print(title)
        print(separator)

    section('Initialization', blank_line_before=False)
    #connection
    r = connect_to_redis(redis_db_num)
    client = connect_to_mongo()

    # clean the databases
    reset_redis(r)
    reset_mongo(client)

    # setup the databases
    _, companies = setup_mongo(client)

    section('Operation 1 - Add a new company')
    add_company(r,companies,
    {'company_name':'TAU', 'company_description':'University'})

    section('Operation 2 - Add a new job position')
    add_job(r,companies,
    {'job_title':'bi developer', 'location': 'Tel Aviv','requirements':['python','big data','mongodb'],'status':'open','publish_date':'01-02-2020'},'TAU')

    section('Operation 3 - Add a new application')
    new_application(r,companies,
    {'candidate_name':'laura', 'email':'laura@gmail.com','linkedin':'https://www.linkedin.com/in/laura/', 'skills': ['python','sql']},'01-02-2020 15:00:00', '1','TAU')

    section('Operation 4 - Update job status')
    update_job_status(r, companies, 'TAU','1','close')
    update_job_status(r, companies, 'TAU','1','open')

    section('Operation 5 - Show latest companies')
    latest_comps = show_latest_10_companies(r, 'lebron@gmail.com')
    if len(latest_comps)>0:
        print(latest_comps)

    section('Operation 6 - Show number of open jobs')
    # named differently from the module-level `open_jobs` key prefix so
    # the global is not shadowed inside this function
    open_jobs_count = show_number_of_jobs(r, 'Tel Aviv','bi developer')
    print('Number of open jobs : {}'.format(open_jobs_count))

    section('Report 1')
    jobs_by_company_df = count_jobs_by_company(companies)
    print(jobs_by_company_df)

    section('Report 2')
    candidates_by_job_df = count_candidates_by_job(companies)
    print(candidates_by_job_df)

    section('Recovery')
    res = recovery(r,companies)
    print('recovered:')
    for k in res:
        print(str(res[k])+ ' ' + k)

In [17]:
execute()

Initialization
--------------------------------
connected to redis on "bdl1.eng.tau.ac.il", port 6379
redis database number 10 is clean
mongo is clean
companies collection created in stud10 db

Operation 1 - Add a new company
--------------------------------
TAU Added successfully!

Operation 2 - Add a new job position
--------------------------------
There are now 1 open jobs for: bi developer in Tel Aviv
job with id 1 was added to TAU jobs successfully!

Operation 3 - Add a new application
--------------------------------
laura submited application for job number 1 at TAU

Operation 4 - Update job status
--------------------------------
job number: 1 at TAU is now: close
There are now 0 open jobs for: bi developer in Tel Aviv
job number: 1 at TAU is now: open
There are now 1 open jobs for: bi developer in Tel Aviv

Operation 5 - Show latest companies
--------------------------------
No candidate with this email: lebron@gmail.com

Operation 6 - Show number of open jobs
---------------