# Importing Dependencies

In [132]:
import pandas as pd               #python package for data analysis
import os                         #python package for interacting with system file
import re                         #python package for dealing with regular expression
import numpy as np                #numpy is dependency of pandas
import neattext as nt             #python package for text cleaning
import spacy                      #premium NLP package for Named Entities Recognition
nlp=spacy.load('en_core_web_sm')  #creating object of spacy model

# Reading Data fram Files in each Directory

In [133]:
folder_list=os.listdir()   #getting list of all the directories

In [134]:
folder_list[1:-1]        #filter our desired directories

['2020 - 2021', '2020-2021']

In [135]:
files_dic={}          #dictionary for files management
files_list=[]         #list for to help in files management using dictionary

In [136]:
for folder in folder_list[1:-1]:        #interating through each directory
    files=os.listdir(folder)             #getting list of files present in a specific directory
    for file in files:                   #iterating each file of a specific directory
        f=open(folder+'/'+file,encoding='utf-8')  #reading each file with UTF-8 encoding
        f=f.read()                               #extacting text from each file
        f=re.sub(file[0:-4],'',f)
        files_list.append(f)
    files_dic[folder]=files_list                #creating dictionary of a directory and its files

In [137]:
files_dic

{'2020 - 2021': [' Data Analytics for the Internet of Things (3).\nConcepts/applications of Data Analytics for loT. Data science, machine learning, and artificial intelligence at the edge. Inference, sensor fusion, bandwidth, transfer learning, and generative models.',
  ' Fundamentals of Data Science (3).\nThis course will teach data science fundamentals to undergraduate non-CS majors. The focus will be on real- world applications and use of associated analysis, Visualization tools, Python programming.\nPrerequisite: None for B.S. or B.A. standing or permission of the instructor.',
  ' Human-Computer Interaction (3).\nHCI foundations, user-centered interaction design, prototyping and programming interactive systems, qualitative and quantitative evaluation techniques, designing multimodal interfaces.',
  ' Introduction to Robot Vision (3).\nPerspective and orthographic projections; the processing of edges, regions, motion, shading, texture, object detection, recognition, and machine le

In [138]:
df=pd.DataFrame(files_dic)   #converting dictionary to data frame

# Files Data Frame of each Directory

In [139]:
df

Unnamed: 0,2020 - 2021,2020-2021
0,Data Analytics for the Internet of Things (3)...,Data Analytics for the Internet of Things (3)...
1,Fundamentals of Data Science (3).\nThis cours...,Fundamentals of Data Science (3).\nThis cours...
2,Human-Computer Interaction (3).\nHCI foundati...,Human-Computer Interaction (3).\nHCI foundati...
3,Introduction to Robot Vision (3).\nPerspectiv...,Introduction to Robot Vision (3).\nPerspectiv...
4,Introduction to Game Theory (3).\nIntroductio...,Introduction to Game Theory (3).\nIntroductio...
...,...,...
210,Bioelectrical Models (3).\nEngineering models...,Bioelectrical Models (3).\nEngineering models...
211,Bioradiation Engineering (3).\nSpectrum of ra...,Bioradiation Engineering (3).\nSpectrum of ra...
212,Circuit Analysis (3). \nIntroductory circuit ...,Circuit Analysis (3). \nIntroductory circuit ...
213,Circuits Lab (1).\nThis lab introduces basic ...,Circuits Lab (1).\nThis lab introduces basic ...


# Text Preprocessing

In [140]:
def average_file_length(file_text):
    '''
    this function find total lines present in the text and file and total words in each
    line of text file, then it return floor division of no_of_words by total_lines
    '''
    avg_file_len=0
    total_lines=len(file_text.split('.'))
    no_of_words=len(file_text.split())
    avg_file_len=no_of_words//total_lines
    return avg_file_len

In [141]:
def average_word_lenght(file_text):
    '''
    this function takes file one by one from each directory and return average no.
    of words. Because no. of words can not be in float so we did floor division 
    '''
    avg_word_len=0
    total_lenght=0
    for word in file_text.split():
        total_lenght+=len(word)
        avg_word_len=total_lenght//len(file_text.split()) #doubel forward slashes(//) used for floor division
    return avg_word_len

In [142]:
def text_cleaning(file_text):
    '''
    this function remove number and unnecessary characters from the text files
    '''
    cleaned_words=[]
    doc=nlp(file_text)
    for word in doc.ents:
        if word.label_ == "CARDINAL" or word.label_ == "DATE":
            pass
        else:
            cleaned_words.append(word.text)
    return " ".join(cleaned_words)

In [143]:
#this loop iterate each file a directory and fing its word lengtha and average length
for col in df.columns:
    df[col+' avg_file_length']=df[col].apply(lambda x:average_file_length(x))
    df[col+' avg_no of words']=df[col].apply(lambda x:average_word_lenght(x))
    df[col]=df[col].apply(lambda x:text_cleaning(str(x)))
    

In [144]:
def find_common_unique_words(df):
    '''
    this function iterate each directory contents and return common and unique words
    among the all the directories.
    '''
    words = set()
    for column in df.columns:
        column_words = set(df[column].str.split().sum())
        words = words.union(column_words)
    common_words = set.intersection(*(set(df[column].str.split().sum()) for column in df.columns))
    unique_words = words - common_words
    summary_dic={"unique_words":unique_words,"common_words":common_words}
    new_df=pd.DataFrame(summary_dic.values(),index=["unique_words","common_words"])
    return new_df

# Final Results

In [145]:
res_df=find_common_unique_words(df[files_dic.keys()])

In [146]:
res_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,292,293,294,295,296,297,298,299,300,301
unique_words,,,,,,,,,,,...,,,,,,,,,,
common_words,Enforces,Relational,Symmetric,Toolkit,ATM,COP2270,Query,Personal,Software,Based,...,Estimation,Intro,UML,Microcomputer,MPI,Controls,3215.0,Optimization,Business,Datacenter


In [147]:
df

Unnamed: 0,2020 - 2021,2020-2021,2020 - 2021 avg_file_length,2020 - 2021 avg_no of words,2020-2021 avg_file_length,2020-2021 avg_no of words
0,Data Analytics Data Analytics loT. Data,Data Analytics Data Analytics loT. Data,6,6,6,6
1,B.S. B.A.,B.S. B.A.,5,5,5,5
2,Human-Computer Interaction,Human-Computer Interaction,7,9,7,9
3,Introduction to Robot Vision MAC,Introduction to Robot Vision MAC,7,6,7,6
4,MAC Permission of the Instructor,MAC Permission of the Instructor,9,6,9,6
...,...,...,...,...,...,...
210,,,8,6,8,6
211,,,7,6,7,6
212,DC AC Laplace MAC CpE Engineering,DC AC Laplace MAC CpE Engineering,8,5,8,5
213,Circuits Lab,Circuits Lab,5,6,5,6
