In [2]:
import re

import numpy as np
import pandas as pd
import plotly.express as px

from difflib import SequenceMatcher

In [3]:
def get_name_vars(code):
    """Map assignment targets found in ``code`` to placeholder names.

    Only lines containing the exact text ``' = '`` are considered. For each
    such line (leading whitespace removed) the text before the first ``' = '``
    is the candidate name.

    Args:
        code: Source text, lines separated by ``'\\n'``.

    Returns:
        dict mapping each accepted candidate to ``'v_<k>'``, where ``k`` is the
        position of its line among all lines containing ``' = '`` (skipped
        lines still consume a position). Only the first occurrence of a
        candidate is recorded.
    """
    assignment_lines = [raw.lstrip() for raw in code.split('\n') if ' = ' in raw]

    vars_list = {}
    for position, statement in enumerate(assignment_lines):
        target = statement.partition(' = ')[0]

        # Skip repeats, dotted targets (attribute assignments) and comment lines.
        if target in vars_list or '.' in target or target.startswith('#'):
            continue

        vars_list[target] = f'v_{position}'

    return vars_list

def get_name_aliases(code):
    """Map names introduced by ``as`` clauses in ``code`` to placeholder names.

    Only lines containing the exact text ``' as '`` are considered. For each
    such line the text after the last ``' as '`` is the alias, cleaned of any
    trailing comment, surrounding whitespace and a trailing ``':'``.

    Args:
        code: Source text, lines separated by ``'\\n'``.

    Returns:
        dict mapping each alias to ``'module_<k>'``, where ``k`` is the position
        of its line among all lines containing ``' as '`` (skipped lines still
        consume a position). Only the first occurrence of an alias is recorded.
    """
    aliases_list = {}
    candidate_lines = [line.lstrip() for line in code.split('\n') if ' as ' in line]

    for idx, line in enumerate(candidate_lines):
        # A commented-out line defines nothing. The check has to look at the
        # start of the line: the alias itself sits at the end of it.
        if line.startswith('#'):
            continue

        alias = line.split(' as ')[-1]
        # Reduce the tail of the line to the bare name as it appears at its use
        # sites: drop an inline comment, whitespace (including the '\r' left by
        # CRLF input) and the ':' that closes ``with`` / ``except`` headers.
        alias = alias.split('#', 1)[0].strip().rstrip(':').rstrip()

        # A line ending in ' as ' yields an empty alias; ignore it rather than
        # indexing into an empty string.
        if alias and alias not in aliases_list:
            aliases_list[alias] = f'module_{idx}'

    return aliases_list


def replace_names(code, names):
    """Replace every whole-word occurrence of each key of ``names`` in ``code``.

    Args:
        code: Text to rewrite.
        names: Mapping of original name -> replacement. Keys are applied one at
            a time in the mapping's iteration order, so a later key can match
            text produced by an earlier replacement.

    Returns:
        The rewritten text.
    """
    for key, value in names.items():
        # Keys are literal names, not patterns. Escaping keeps characters such
        # as '(' or '[' from raising re.error and '.' from acting as a wildcard.
        code = re.sub(fr'\b{re.escape(key)}\b', value, code)
    return code


def hash_comparison(s1, s2):
    """Return 1 when ``s1`` and ``s2`` are equal ignoring case, otherwise 0.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        int: 1 for a case-insensitive exact match, 0 otherwise.
    """
    # Compare the lower-cased values themselves. Equal hashes do not prove
    # equal content, so comparing hash() results could report a false match.
    return int(s1.lower() == s2.lower())


def similar(s1, s2):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of two sequences.

    Args:
        s1: First sequence (passed as ``a``).
        s2: Second sequence (passed as ``b``).

    Returns:
        float: the value of ``SequenceMatcher.ratio()`` for the pair.
    """
    matcher = SequenceMatcher(isjunk=None, a=s1, b=s2)
    return matcher.ratio()


def normalisation(code):
    """Rewrite ``code`` with placeholder names and strip its newlines.

    Alias and variable maps are both built from the unmodified input. Aliases
    are substituted first, then variables, and finally every ``'\\n'`` is
    removed.

    Args:
        code: Source text to normalise.

    Returns:
        str: the rewritten text as a single line.
    """
    # Build both maps before any substitution so they describe the original text.
    alias_map = get_name_aliases(code)
    variable_map = get_name_vars(code)

    for mapping in (alias_map, variable_map):
        code = replace_names(code, mapping)

    return ''.join(code.split('\n'))

In [4]:
# Load the answers table from CSV. Later cells read its 'Имя' and 'Фамилия'
# columns and every column whose name contains 'Ответ '.
answers = pd.read_csv('5. Python.csv')

In [11]:
# Names of the columns that hold answers: those containing 'Ответ '.
name_answers = [column for column in answers.columns if 'Ответ ' in column]

In [12]:
# Show the selected answer columns.
name_answers

['Ответ 1',
 'Ответ 2',
 'Ответ 3',
 'Ответ 4',
 'Ответ 5',
 'Ответ 6',
 'Ответ 7',
 'Ответ 8',
 'Ответ 9',
 'Ответ 10']

In [13]:
# Build one "<surname> <name>" label per row of the answers table.
students = []
for surname, raw_name in zip(answers['Фамилия'], answers['Имя']):
    name_parts = raw_name.split()
    # A two-word name field contributes its second word; any other length
    # contributes its first word.
    chosen_name = name_parts[1] if len(name_parts) == 2 else name_parts[0]
    students.append(f"{surname} {chosen_name}")

In [14]:
# Show the generated labels.
students

['Андреев Дмитрий',
 'Чен Юлия',
 'Серова Анастасия',
 'Симонов Евгений',
 'Телелюхин Константин',
 'Костин Даниил',
 'Ляхов Дмитрий',
 'Борисова Анастасия',
 'Дынина Екатерина',
 'Гаранина Софья',
 'Бычков Максим',
 'Магомедэминов Никита',
 'Невоструев Андрей',
 'Воронина Дарья',
 'Кузьмина Валентина',
 'Шаповалова Софья',
 'Гранек Анна',
 'Суворова Галина',
 'Гани Михаил',
 'Булинин Игорь',
 'Илющенко Анна',
 'Сергеенкова Марта',
 'Зеленев Вячеслав',
 'Лебедева Екатерина',
 'Ельчев Пётр',
 'Лакиза Дмитрий',
 'Серёжин Владимир',
 'Павлюченков Сергей',
 'Цхвитария Николас',
 'Овчинникова Милана']

In [18]:
# Work on the first of the selected answer columns.
name_answer = name_answers[0]
text_of_answer = [normalisation(raw_answer) for raw_answer in answers[name_answer]]

# Square matrix with one row and one column per row of `answers`, zero-filled.
n_rows = answers.shape[0]
corr_matrix = np.zeros((n_rows, n_rows))

for i, ref_answer in enumerate(text_of_answer):
    for j, checking_answer in enumerate(text_of_answer):
        # A '-' answer is never compared; the cell keeps its initial 0.
        if '-' in (ref_answer, checking_answer):
            continue

        # Case-insensitive exact match scores 1, otherwise the rounded ratio.
        if hash_comparison(ref_answer, checking_answer):
            score = 1
        else:
            score = round(similar(ref_answer, checking_answer), 2)
        corr_matrix[i, j] = score

In [19]:
# Draw the matrix as an annotated heat map labelled with the student names on
# both axes, then save it to "<name_answer>.jpeg".
fig = px.imshow(
    corr_matrix,
    x=students,
    y=students,
    text_auto=True,
    aspect="auto",
    title=name_answer,
)

# Column labels go on top, tilted so they do not overlap.
fig.update_xaxes(side="top", tickangle=-50)
fig.update_layout(autosize=False, width=1200, height=1000)
fig.write_image(f"{name_answer}.jpeg")